class color_class:
    """ANSI escape sequences for colored / styled terminal output.

    Usage: ``print(color_class.OKBLUE + 'message' + color_class.END)``.
    Always terminate styled text with ``END`` to reset the terminal state.
    """
    # NOTE(review): chaining several foreground-color codes means only the
    # last one ('\033[92m', green) actually sets the color; the others are
    # overridden. Kept as-is for backward compatibility.
    BOLD_COLOR = '\033[1m' + '\033[93m' + '\033[94m' + '\033[95m' + '\033[91m' + '\033[92m'
    BOLD = '\033[1m'       # bold text
    HEADER = '\033[95m'    # magenta
    OKBLUE = '\033[94m'    # blue
    OKGREEN = '\033[92m'   # green
    WARNING = '\033[93m'   # yellow
    FAIL = '\033[91m'      # red
    END = '\033[0m'        # reset all styling
    UNDERLINE = '\033[4m'  # underlined text
print(color_class.OKBLUE + '\nImporting all the required libraries....\n\n'+ color_class.END)
import warnings
warnings.filterwarnings("ignore")
# Base libraries
import os
import numpy as np
import pandas as pd
import re
import string
import glob
import math
from IPython.display import display_html
import tqdm
!pip install wandb
import wandb
## visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
import seaborn as sns
!pip install pywaffle
from pywaffle import Waffle
# stat tools
import statsmodels.api as sm
from scipy.stats import kurtosis, skew
## preprocessing & otherlibraries
from sklearn.model_selection import (train_test_split,
cross_val_score,
StratifiedKFold,
GridSearchCV)
from sklearn.preprocessing import (StandardScaler,
MinMaxScaler,
RobustScaler)
## data sampling and outlier detection libraries
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
!pip install umap-learn[plot]
!pip install holoviews
!pip install -U ipykernel
from umap import UMAP
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE,RandomOverSampler
# modeling
from sklearn.linear_model import (LinearRegression,
LogisticRegression)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, plot_importance, early_stopping
from sklearn.ensemble import (AdaBoostClassifier,
ExtraTreesClassifier,
RandomForestClassifier,
GradientBoostingClassifier)
# metrics
from sklearn.metrics import (r2_score,
accuracy_score,
roc_auc_score,
f1_score,
recall_score,
precision_score,
recall_score,
confusion_matrix)
Importing all the required libraries.... Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting wandb Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 54.4 MB/s eta 0:00:00 Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.9/dist-packages (from wandb) (1.4.4) Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (2.27.1) Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (8.1.3) Requirement already satisfied: PyYAML in /usr/local/lib/python3.9/dist-packages (from wandb) (6.0) Collecting sentry-sdk>=1.0.0 Downloading sentry_sdk-1.20.0-py2.py3-none-any.whl (198 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 198.8/198.8 kB 19.5 MB/s eta 0:00:00 Collecting docker-pycreds>=0.4.0 Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB) Requirement already satisfied: protobuf!=4.21.0,<5,>=3.15.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (3.20.3) Collecting pathtools Downloading pathtools-0.1.2.tar.gz (11 kB) Preparing metadata (setup.py) ... 
done Collecting GitPython!=3.1.29,>=1.0.0 Downloading GitPython-3.1.31-py3-none-any.whl (184 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 184.3/184.3 kB 18.2 MB/s eta 0:00:00 Collecting setproctitle Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB) Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from wandb) (67.6.1) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from wandb) (4.5.0) Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.9/dist-packages (from wandb) (5.9.5) Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.9/dist-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0) Collecting gitdb<5,>=4.0.1 Downloading gitdb-4.0.10-py3-none-any.whl (62 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 7.9 MB/s eta 0:00:00 Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (1.26.15) Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (2.0.12) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (2022.12.7) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3,>=2.0.0->wandb) (3.4) Collecting smmap<6,>=3.0.1 Downloading smmap-5.0.0-py3-none-any.whl (24 kB) Building wheels for collected packages: pathtools Building wheel for pathtools (setup.py) ... 
done Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8807 sha256=d53ae6f4cd2a77d4a60ec6378c509f2c0c9d068d58a868c49c0fc15cd55b2649 Stored in directory: /root/.cache/pip/wheels/b7/0a/67/ada2a22079218c75a88361c0782855cc72aebc4d18d0289d05 Successfully built pathtools Installing collected packages: pathtools, smmap, setproctitle, sentry-sdk, docker-pycreds, gitdb, GitPython, wandb Successfully installed GitPython-3.1.31 docker-pycreds-0.4.0 gitdb-4.0.10 pathtools-0.1.2 sentry-sdk-1.20.0 setproctitle-1.3.2 smmap-5.0.0 wandb-0.15.0 Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting pywaffle Downloading pywaffle-1.1.0-py2.py3-none-any.whl (30 kB) Collecting fontawesomefree Downloading fontawesomefree-6.4.0-py3-none-any.whl (25.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25.2/25.2 MB 58.6 MB/s eta 0:00:00 Requirement already satisfied: matplotlib in /usr/local/lib/python3.9/dist-packages (from pywaffle) (3.7.1) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (3.0.9) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (0.11.0) Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (8.4.0) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (4.39.3) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.4.4) Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (5.12.0) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.0.7) Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from 
matplotlib->pywaffle) (2.8.2) Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (1.22.4) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->pywaffle) (23.1) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib->pywaffle) (3.15.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib->pywaffle) (1.16.0) Installing collected packages: fontawesomefree, pywaffle Successfully installed fontawesomefree-6.4.0 pywaffle-1.1.0 Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting umap-learn[plot] Downloading umap-learn-0.5.3.tar.gz (88 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 88.2/88.2 kB 6.5 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.22.4) Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.2.2) Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.10.1) Requirement already satisfied: numba>=0.49 in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.56.4) Collecting pynndescent>=0.5 Downloading pynndescent-0.5.9.tar.gz (1.1 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 53.3 MB/s eta 0:00:00 Preparing metadata (setup.py) ... 
done Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (4.65.0) Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.5.3) Requirement already satisfied: matplotlib in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (3.7.1) Collecting datashader Downloading datashader-0.14.4-py2.py3-none-any.whl (18.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.2/18.2 MB 75.5 MB/s eta 0:00:00 Requirement already satisfied: bokeh in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (2.4.3) Requirement already satisfied: holoviews in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (1.15.4) Requirement already satisfied: colorcet in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (3.0.1) Requirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.12.2) Requirement already satisfied: scikit-image in /usr/local/lib/python3.9/dist-packages (from umap-learn[plot]) (0.19.3) Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from numba>=0.49->umap-learn[plot]) (67.6.1) Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.9/dist-packages (from numba>=0.49->umap-learn[plot]) (0.39.1) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.9/dist-packages (from pynndescent>=0.5->umap-learn[plot]) (1.2.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.22->umap-learn[plot]) (3.1.0) Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (23.1) Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (6.2) Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.9/dist-packages (from 
bokeh->umap-learn[plot]) (6.0) Requirement already satisfied: typing-extensions>=3.10.0 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (4.5.0) Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (8.4.0) Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.9/dist-packages (from bokeh->umap-learn[plot]) (3.1.2) Requirement already satisfied: pyct>=0.4.4 in /usr/local/lib/python3.9/dist-packages (from colorcet->umap-learn[plot]) (0.5.0) Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2.27.1) Requirement already satisfied: toolz in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (0.12.0) Requirement already satisfied: param in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (1.13.0) Requirement already satisfied: xarray in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2022.12.0) Collecting datashape Downloading datashape-0.5.2.tar.gz (76 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.5/76.5 kB 7.7 MB/s eta 0:00:00 Preparing metadata (setup.py) ... 
done Requirement already satisfied: dask in /usr/local/lib/python3.9/dist-packages (from datashader->umap-learn[plot]) (2022.12.1) Requirement already satisfied: panel>=0.13.1 in /usr/local/lib/python3.9/dist-packages (from holoviews->umap-learn[plot]) (0.14.4) Requirement already satisfied: pyviz-comms>=0.7.4 in /usr/local/lib/python3.9/dist-packages (from holoviews->umap-learn[plot]) (2.2.1) Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->umap-learn[plot]) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->umap-learn[plot]) (2022.7.1) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (3.0.9) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (4.39.3) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (1.0.7) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (1.4.4) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (0.11.0) Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib->umap-learn[plot]) (5.12.0) Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (1.4.1) Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (2.25.1) Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.9/dist-packages (from scikit-image->umap-learn[plot]) (3.1) Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.9/dist-packages (from 
scikit-image->umap-learn[plot]) (2023.4.12) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib->umap-learn[plot]) (3.15.0) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from Jinja2>=2.9->bokeh->umap-learn[plot]) (2.1.2) Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews->umap-learn[plot]) (6.0.0) Requirement already satisfied: markdown in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews->umap-learn[plot]) (3.4.3) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->umap-learn[plot]) (1.16.0) Requirement already satisfied: fsspec>=0.6.0 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (2023.4.0) Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (8.1.3) Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (2.2.1) Requirement already satisfied: partd>=0.3.10 in /usr/local/lib/python3.9/dist-packages (from dask->datashader->umap-learn[plot]) (1.4.0) Requirement already satisfied: multipledispatch>=0.4.7 in /usr/local/lib/python3.9/dist-packages (from datashape->datashader->umap-learn[plot]) (0.6.0) Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (2.0.12) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (2022.12.7) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (1.26.15) Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.9/dist-packages (from requests->datashader->umap-learn[plot]) (3.4) Requirement already satisfied: locket in /usr/local/lib/python3.9/dist-packages (from partd>=0.3.10->dask->datashader->umap-learn[plot]) (1.0.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->panel>=0.13.1->holoviews->umap-learn[plot]) (0.5.1) Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.9/dist-packages (from markdown->panel>=0.13.1->holoviews->umap-learn[plot]) (6.4.1) Building wheels for collected packages: pynndescent, umap-learn, datashape Building wheel for pynndescent (setup.py) ... done Created wheel for pynndescent: filename=pynndescent-0.5.9-py3-none-any.whl size=55620 sha256=48de31d05c1c64c9016e25ff61046b510e0bdbf6b89f4437e758d0ec73d3b451 Stored in directory: /root/.cache/pip/wheels/eb/f2/e3/b8e73d1488d8284d88c9283411561b65bd4f0200abf131a946 Building wheel for umap-learn (setup.py) ... done Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82830 sha256=118f2bf9d535cd8db9192685403e67da93706bd30ee5c651f96996bc824602c1 Stored in directory: /root/.cache/pip/wheels/f4/3e/1c/596d0a463d17475af648688443fa4846fef624d1390339e7e9 Building wheel for datashape (setup.py) ... 
done Created wheel for datashape: filename=datashape-0.5.2-py3-none-any.whl size=59436 sha256=61e9c0317eac03f5ec9f2e7586be577a4ad369947ec69050f65bff6f442d9728 Stored in directory: /root/.cache/pip/wheels/42/ef/d7/781cf80d4146d76b3d2ed2510113d78c2643c842cc6c22918d Successfully built pynndescent umap-learn datashape Installing collected packages: datashape, pynndescent, umap-learn, datashader Successfully installed datashader-0.14.4 datashape-0.5.2 pynndescent-0.5.9 umap-learn-0.5.3 Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: holoviews in /usr/local/lib/python3.9/dist-packages (1.15.4) Requirement already satisfied: pyviz-comms>=0.7.4 in /usr/local/lib/python3.9/dist-packages (from holoviews) (2.2.1) Requirement already satisfied: panel>=0.13.1 in /usr/local/lib/python3.9/dist-packages (from holoviews) (0.14.4) Requirement already satisfied: param<2.0,>=1.9.3 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.13.0) Requirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from holoviews) (23.1) Requirement already satisfied: pandas>=0.20.0 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.5.3) Requirement already satisfied: numpy>=1.0 in /usr/local/lib/python3.9/dist-packages (from holoviews) (1.22.4) Requirement already satisfied: colorcet in /usr/local/lib/python3.9/dist-packages (from holoviews) (3.0.1) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.20.0->holoviews) (2022.7.1) Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=0.20.0->holoviews) (2.8.2) Requirement already satisfied: markdown in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (3.4.3) Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (2.27.1) Requirement already 
satisfied: bokeh<2.5.0,>=2.4.0 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (2.4.3) Requirement already satisfied: pyct>=0.4.4 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (0.5.0) Requirement already satisfied: bleach in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (6.0.0) Requirement already satisfied: setuptools>=42 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (67.6.1) Requirement already satisfied: tqdm>=4.48.0 in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (4.65.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from panel>=0.13.1->holoviews) (4.5.0) Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (8.4.0) Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (6.0) Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (3.1.2) Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.9/dist-packages (from bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (6.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas>=0.20.0->holoviews) (1.16.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.9/dist-packages (from bleach->panel>=0.13.1->holoviews) (0.5.1) Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.9/dist-packages (from markdown->panel>=0.13.1->holoviews) (6.4.1) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (2022.12.7) Requirement already satisfied: charset-normalizer~=2.0.0 in 
/usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (2.0.12) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (3.4) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->panel>=0.13.1->holoviews) (1.26.15) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.9/dist-packages (from importlib-metadata>=4.4->markdown->panel>=0.13.1->holoviews) (3.15.0) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from Jinja2>=2.9->bokeh<2.5.0,>=2.4.0->panel>=0.13.1->holoviews) (2.1.2) Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: ipykernel in /usr/local/lib/python3.9/dist-packages (5.5.6) Collecting ipykernel Downloading ipykernel-6.22.0-py3-none-any.whl (149 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.0/150.0 kB 9.8 MB/s eta 0:00:00 Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.3.0) Requirement already satisfied: packaging in /usr/local/lib/python3.9/dist-packages (from ipykernel) (23.1) Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.9/dist-packages (from ipykernel) (1.5.6) Requirement already satisfied: matplotlib-inline>=0.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (0.1.6) Requirement already satisfied: psutil in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.9.5) Requirement already satisfied: ipython>=7.23.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (7.34.0) Collecting comm>=0.1.1 Downloading comm-0.1.3-py3-none-any.whl (6.6 kB) Requirement already satisfied: debugpy>=1.6.5 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (1.6.6) Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.9/dist-packages 
(from ipykernel) (6.1.12) Requirement already satisfied: traitlets>=5.4.0 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (5.7.1) Requirement already satisfied: tornado>=6.1 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (6.2) Requirement already satisfied: pyzmq>=20 in /usr/local/lib/python3.9/dist-packages (from ipykernel) (23.2.1) Requirement already satisfied: backcall in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (0.2.0) Requirement already satisfied: decorator in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (4.4.2) Collecting jedi>=0.16 Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 71.3 MB/s eta 0:00:00 Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (3.0.38) Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (67.6.1) Requirement already satisfied: pickleshare in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (0.7.5) Requirement already satisfied: pygments in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (2.14.0) Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.9/dist-packages (from ipython>=7.23.1->ipykernel) (4.8.0) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.9/dist-packages (from jupyter-client>=6.1.12->ipykernel) (2.8.2) Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.9/dist-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (3.2.0) Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/dist-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel) (0.8.3) Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.9/dist-packages (from 
pexpect>4.3->ipython>=7.23.1->ipykernel) (0.7.0) Requirement already satisfied: wcwidth in /usr/local/lib/python3.9/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.23.1->ipykernel) (0.2.6) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.1->jupyter-client>=6.1.12->ipykernel) (1.16.0) Installing collected packages: jedi, comm, ipykernel Attempting uninstall: ipykernel Found existing installation: ipykernel 5.5.6 Uninstalling ipykernel-5.5.6: Successfully uninstalled ipykernel-5.5.6 ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. google-colab 1.0.0 requires ipykernel~=5.5.6, but you have ipykernel 6.22.0 which is incompatible. Successfully installed comm-0.1.3 ipykernel-6.22.0 jedi-0.18.2
# feature selection and model interpretation
!pip install shap
import shap
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 572.4/572.4 kB 21.3 MB/s eta 0:00:00
Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from shap) (1.22.4)
Requirement already satisfied: numba in /usr/local/lib/python3.9/dist-packages (from shap) (0.56.4)
Requirement already satisfied: cloudpickle in /usr/local/lib/python3.9/dist-packages (from shap) (2.2.1)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.9/dist-packages (from shap) (1.2.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from shap) (1.5.3)
Collecting slicer==0.0.7
Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Requirement already satisfied: tqdm>4.25.0 in /usr/local/lib/python3.9/dist-packages (from shap) (4.65.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from shap) (1.10.1)
Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.9/dist-packages (from shap) (23.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from numba->shap) (67.6.1)
Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.9/dist-packages (from numba->shap) (0.39.1)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->shap) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn->shap) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn->shap) (1.2.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eli5
Downloading eli5-0.13.0.tar.gz (216 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 216.2/216.2 kB 14.3 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: attrs>17.1.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (23.1.0)
Requirement already satisfied: jinja2>=3.0.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (3.1.2)
Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.9/dist-packages (from eli5) (1.22.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from eli5) (1.10.1)
Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from eli5) (1.16.0)
Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.9/dist-packages (from eli5) (1.2.2)
Requirement already satisfied: graphviz in /usr/local/lib/python3.9/dist-packages (from eli5) (0.20.1)
Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.9/dist-packages (from eli5) (0.8.10)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2>=3.0.0->eli5) (2.1.2)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20->eli5) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20->eli5) (3.1.0)
Building wheels for collected packages: eli5
Building wheel for eli5 (setup.py) ... done
Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107747 sha256=a3cccdc9345c994f91535f1229c707a493f5bb685d3b36589f336571ac0bd518
Stored in directory: /root/.cache/pip/wheels/7b/26/a5/8460416695a992a2966b41caa5338e5e7fcea98c9d032d055c
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0
!pip install vecstack
from vecstack import stacking
## plot settings
sns.set_style('white')
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['axes.spines.left'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = False
plt.rcParams.update({'font.size':14})
plt.rcParams['font.weight']= 'normal'
print(color_class.BOLD + 'Done!!')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vecstack
Downloading vecstack-0.4.0.tar.gz (18 kB)
Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.22.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.10.1)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.9/dist-packages (from vecstack) (1.2.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.18->vecstack) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.18->vecstack) (1.2.0)
Building wheels for collected packages: vecstack
Building wheel for vecstack (setup.py) ... done
Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19879 sha256=23693121a7841649be4d9ba5ab6b9448fae447972c90dd52e5b21d5061ebb160
Stored in directory: /root/.cache/pip/wheels/7e/ee/d6/47cb94a403bc544de1433986e5530d6b0498021098fbe43aa1
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0
Done!!
# Load the dataset interactively via the Colab file-upload widget.
import pandas as pd
from google.colab import files
uploaded = files.upload()  # dict mapping each uploaded filename -> file bytes
Saving breast-cancer-wisconsin.names to breast-cancer-wisconsin (1).names
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Project colour palette used throughout the notebook:
# [0] lilac, [1] teal, [2] orange, [3] green, [4] yellow, [5] black.
colors= ['#e5b6f9' ,'#6fb3a9' ,'#eda760' ,'#c6e699' ,'#fff4af','#000000']
# Render the swatches plus a caption so the palette is visible in the output.
sns.palplot(colors,size = 3)
plt.gcf().set_size_inches(15,5)
plt.text(-0.75,-0.75, 'Women and Cancer: Color Palette',{'fontfamily':'serif', 'size':24, 'weight':'bold'})
plt.text(-0.75,-0.68, 'Lets try to stick to these colors throughout presentation.',{'fontfamily':'serif', 'size':16},alpha = 0.9)
# Print each hex code on top of its swatch.
for idx,values in enumerate(colors):
    plt.text(idx-0.25,0, colors[idx],{'fontfamily':'serif', 'size':16, 'weight':'bold','color':'black'}, alpha =0.8)
plt.gcf().set_facecolor('white')
plt.box(None)
plt.axis('off')
plt.text(3.5,0.65,'© Made by Milon',{'fontfamily':'serif', 'size':10, 'color':'black'})
plt.show()
# Null accuracy: the accuracy obtained by always predicting the majority
# class. Any real classifier must beat this baseline to be useful.
pos_rate = df.diagnosis.values.mean()
NUll_acc = round(max(pos_rate, 1 - pos_rate), 2)
score_msg = (color_class.BOLD + '\nNull Accuracy Score: ' + color_class.END
             + color_class.OKGREEN + str(NUll_acc) + color_class.END + '\n')
print(score_msg)
print(color_class.OKGREEN + 'This is the Baseline our model need to cross.\n' + color_class.END)
Null Accuracy Score: 0.63 This is the Baseline our model need to cross.
# Feature / target split: id and diagnosis are removed from the features.
feat_df = df.drop(columns = ['id', 'diagnosis'])
tar_df = df['diagnosis']
# Class balance in percent (rounded) for the waffle labels below.
cancer_dist = round(tar_df.value_counts(normalize = True),2)*100
# 10x10 waffle chart: each square represents 1% of patients.
fig = plt.figure(FigureClass = Waffle,
                 constrained_layout = True,
                 figsize = (8,5),
                 facecolor = 'white',dpi = 100,
                 plots = {'111':
                          {
                           'rows':10,
                           'columns': 10,
                           'values' : [cancer_dist.values[0],cancer_dist.values[1]],
                           'colors' : [colors[1],colors[0]],
                           'vertical' : True,
                           'interval_ratio_y': 0.2,
                           'interval_ratio_x': 0.2,
                           'icon_legend': False,
                           'icon_size':5,
                           'plot_anchor':'C',
                          },
                         })
## labeling
# Percentage annotations placed over each class region of the waffle.
fig.text(0.36,0.725, '{}%'.format(cancer_dist.values[1]), {'fontfamily':'serif','size':20, 'weight':'bold', 'color':colors[5]})
fig.text(0.625,0.36, '{}%'.format(cancer_dist.values[0]),{'fontfamily':'serif','size':20, 'weight':'bold','color':colors[5]})
## titles and text
#fig.text(-0.1,1.035,'Women and Cancer: How Susceptable Are Women To Breast Cancer?', {'font':'serif','size':18, 'weight':'bold'}, alpha = 1)
#fig.text(-0.1,0.96,'''Its really sad to see nearly 40% of the women are suceptable to cancer.
#Lets hope things will change with medical advancements.''',{'font':'serif','size':12, 'weight':'normal'}, alpha = 0.9)
#fig.text(0.75,0.50, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})
#fig.text(0.85,0.95, '|',{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[3]})
# Legend text: teal = healthy majority, lilac = cancerous minority.
fig.text(0.85,0.70, "Healthy",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.85,0.50, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})
fig.text(0.82,0.1,'© Made by Milon',{'fontfamily':'serif', 'size':8,'weight':'bold'}, alpha = 0.7)
fig.show()
# 10x3 grid of violin plots, one per feature (30 features expected).
fig,ax = plt.subplots(nrows = 10, ncols = 3, figsize = (12,24),dpi=80)
#fig.patch.set_facecolor(colors[-1])
axes = ax.ravel()
# Colour encodes the univariate shape: skew > 1 -> colors[0] ("Skewed"),
# otherwise colors[1] ("Relative Normal").
for col,ax in zip(feat_df.columns,axes):
    # skewness and kurtosis
    if skew(feat_df[col])>1:
        color = colors[0]
    else:
        color = colors[1]
    ## plots
    #sns.kdeplot(feat_df[col], ax= ax, fill = True , color = color, alpha = 1, linewidth = 3, ec = 'black')
    # NOTE(review): 'solidblack' is not a valid matplotlib colour spec --
    # presumably 'black' was intended; verify the edge colour actually renders.
    sns.violinplot(feat_df[col], ax =ax,
                   color = color, cut =0,
                   inner = 'box',
                   alpha = 1,linewidth = 3, edgecolor = 'solidblack', saturation =1 )
    ## plot setting
    # Humanise the column name, e.g. 'radius_mean' -> 'Radius Mean'.
    xlabel = ' '.join([value.capitalize() for value in str(col).split('_') ])
    #ax.set_facecolor(colors[-1])
    ax.axes.get_yaxis().set_visible(False)
    ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
plt.tight_layout(pad= 3,h_pad = 2.5, w_pad = 2.5)
## titles and text
#fig.text(0,1.05,'Women and Cancer: Overview of Univariate Feature Distribution', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.02,'''Are there any normally distributed features? It seems most of the features
#are skewed and having high kurtosis, may be a log somekind transformation needed. It seems
#most of the se features and fractual Dimensions have outliers. ''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 1)
fig.text(0.65,1, "Skewed",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]})
fig.text(0.73,1, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.74,1, "Relative Normal",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[1]})
fig.text(0.73,0,'© Made by bhuvanchennoju/Kaggle',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
fig.show()
# 10x3 grid of class-conditional KDE plots, one per feature.
fig,ax = plt.subplots(nrows = 10, ncols = 3, figsize = (12,24),dpi=80)
axes = ax.ravel()
for col,ax in zip(feat_df.columns,axes):
    ## plots
    # BUG FIX: the original call passed a Series positionally together with
    # `hue=df['diagnosis']`, which seaborn interprets as wide-form data and
    # rejects with "ValueError: The following variable cannot be assigned
    # with wide-form data: `hue`" (see the recorded traceback). Long-form
    # usage (data= plus column names for x= and hue=) is the supported API.
    # `shade` is also a deprecated alias of `fill` in modern seaborn.
    sns.kdeplot(data = df, x = col, ax = ax, fill = True,
                palette = [colors[0], colors[2]],
                alpha = 0.95, linewidth = 3, ec = 'black',
                hue = 'diagnosis', hue_order = [1,0],
                legend = False)
    ## plot setting
    # Humanise the column name, e.g. 'radius_mean' -> 'Radius Mean'.
    xlabel = ' '.join([value.capitalize() for value in str(col).split('_') ])
    ax.axes.get_yaxis().set_visible(False)
    ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
plt.tight_layout(pad= 3,h_pad = 1.5, w_pad = 1.5)
## titles and text
# Legend: lilac = cancerous (diagnosis == 1), orange = healthy (== 0).
fig.text(0.615,1, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]}, alpha = 1)
fig.text(0.73,1, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.74,1, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]}, alpha = 1)
fig.text(0.73,0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
fig.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-28-2346bf32333a> in <cell line: 5>() 7 ## plots 8 ----> 9 sns.kdeplot(df[col], ax = ax, shade = True , 10 palette = [colors[0], colors[2]], 11 alpha = 0.95, linewidth = 3, ec = 'black', /usr/local/lib/python3.9/dist-packages/seaborn/distributions.py in kdeplot(data, x, y, hue, weights, palette, hue_order, hue_norm, color, fill, multiple, common_norm, common_grid, cumulative, bw_method, bw_adjust, warn_singular, log_scale, levels, thresh, gridsize, cut, clip, legend, cbar, cbar_ax, cbar_kws, ax, **kwargs) 1683 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 1684 -> 1685 p = _DistributionPlotter( 1686 data=data, 1687 variables=_DistributionPlotter.get_semantics(locals()), /usr/local/lib/python3.9/dist-packages/seaborn/distributions.py in __init__(self, data, variables) 111 ): 112 --> 113 super().__init__(data=data, variables=variables) 114 115 @property /usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in __init__(self, data, variables) 638 # information for numeric axes would be information about log scales. 
639 self._var_ordered = {"x": False, "y": False} # alt., used DefaultDict --> 640 self.assign_variables(data, variables) 641 642 for var, cls in self._semantic_mappings.items(): /usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in assign_variables(self, data, variables) 694 if x is None and y is None: 695 self.input_format = "wide" --> 696 plot_data, variables = self._assign_variables_wideform( 697 data, **variables, 698 ) /usr/local/lib/python3.9/dist-packages/seaborn/_oldcore.py in _assign_variables_wideform(self, data, **kwargs) 743 err = f"The following variable{s} cannot be assigned with wide-form data: " 744 err += ", ".join(f"`{v}`" for v in assigned) --> 745 raise ValueError(err) 746 747 # Determine if the data object actually has any data in it ValueError: The following variable cannot be assigned with wide-form data: `hue`
print(color_class.BOLD_COLOR + '\nSegregating Features Based On Category....\n' + color_class.END)
### measurement and characteristics keyword lists
# First token of a column name decides whether it is a geometric
# "measurement" or a texture-like "characteristic".
measure_keyword = ['radius','perimeter','area','concavity', 'concave points']
character_keyword = ['texture','smoothness','compactness','symmetry','fractal']
### mean, standard error, and worst measure feature lists
# Every list is seeded with 'diagnosis' so each slice can be used directly
# as a pairplot column selection with the target included.
mean_measure, mean_character = ['diagnosis'],['diagnosis']
se_measure, se_character = ['diagnosis'],['diagnosis']
worst_measure,worst_character = ['diagnosis'],['diagnosis']
# Dispatch table: (category, statistic) -> destination list.
_buckets = {
    ('measure', 'mean'): mean_measure,
    ('measure', 'se'): se_measure,
    ('measure', 'worst'): worst_measure,
    ('character', 'mean'): mean_character,
    ('character', 'se'): se_character,
    ('character', 'worst'): worst_character,
}
for col in feat_df.columns:
    tokens = str(col).split('_')
    if tokens[0] in measure_keyword:
        category = 'measure'
    elif tokens[0] in character_keyword:
        category = 'character'
    else:
        continue  # first token matches neither keyword list -> skip
    # Anything that is neither a mean nor a standard error is a "worst" value.
    statistic = 'mean' if 'mean' in tokens else ('se' if 'se' in tokens else 'worst')
    _buckets[(category, statistic)].append(col)
###### descriptions and lists
print(color_class.BOLD + 'Done!' +color_class.END)
print(color_class.BOLD_COLOR + '\nSeperated Features are stored into lists:\n' +color_class.END)
# Report every group; [1:] skips the 'diagnosis' seed entry.
for _label, _feats in [('Mean of Measurements: ', mean_measure),
                       ('Mean of Characteristics: ', mean_character),
                       ('Standard Error of Measurements: ', se_measure),
                       ('Standard Error of Characteristics: ', se_character),
                       ('Worst of Measurements: ', worst_measure),
                       ('Worst of Characteristics: ', worst_character)]:
    print(color_class.BOLD_COLOR + _label + color_class.END
          + color_class.BOLD + str(' , '.join(_feats[1:])) + color_class.END + '\n')
Segregating Features Based On Category.... Done! Seperated Features are stored into lists: Mean of Measurements: radius_mean , perimeter_mean , area_mean , concavity_mean , concave points_mean Mean of Characteristics: texture_mean , smoothness_mean , compactness_mean , symmetry_mean , fractal_dimension_mean Standard Error of Measurements: radius_se , perimeter_se , area_se , concavity_se , concave points_se Standard Error of Characteristics: texture_se , smoothness_se , compactness_se , symmetry_se , fractal_dimension_se Worst of Measurements: radius_worst , perimeter_worst , area_worst , concavity_worst , concave points_worst Worst of Characteristics: texture_worst , smoothness_worst , compactness_worst , symmetry_worst , fractal_dimension_worst
print(color_class.BOLD_COLOR +'\nFinally helper function to visualize bivariate features....\n' + color_class.END)
### bivariate cross relations visualizations function
def cust_pairplot(df,var,title, diag_kind = 'kde',corner = True,sign = 'off'):
    """Draw a seaborn pairplot of `var` columns, hue-split on 'diagnosis'.

    Parameters
    ----------
    df : DataFrame containing the columns in `var` plus 'diagnosis'.
    var : list of column names; expected to include 'diagnosis'.
    title : caption text (currently only used by the commented-out headers).
    diag_kind : diagonal plot kind forwarded to seaborn ('kde' by default).
    corner : if True draw only the lower triangle of the grid.
    sign : 'on' adds the signature text at the bottom of the figure.
    """
    ## plot
    g = sns.pairplot(data = df[var],
                     hue = 'diagnosis', hue_order = [1, 0],
                     height = 2.5, aspect = 1,
                     # BUG FIX: honour the `corner` argument -- it was
                     # previously accepted but hard-coded to True here.
                     corner = corner, diag_kind = diag_kind,
                     palette = [colors[0], colors[2]],
                     plot_kws = {'alpha':1, 'size' : 1, 'linewidth' : 0.5, 'ec':'black'},
                     diag_kws = {'alpha':0.95,'ec':'black','linewidth':3 });
    ### plot setting
    g._legend.remove();
    plt.gcf().patch.set_facecolor('white');
    plt.gcf().patch.set_alpha(1)
    plt.gcf().set_size_inches(12,12);
    # Flatten styling: white panels, no spines, serif bold axis labels.
    for ax in plt.gcf().axes:
        ax.set_facecolor('white')
        for loc in ['left','right','top','bottom']:
            ax.spines[loc].set_visible(False)
        ax.set_xlabel(xlabel = ax.get_xlabel(), **{'fontfamily':'serif', 'size':12,'weight':'bold'}, alpha = 1)
        ax.set_ylabel(ylabel = ax.get_ylabel(), **{'fontfamily':'serif', 'size':12,'weight':'bold'}, rotation = 90,alpha = 1)
    ### titles and descriptions
    # Manual two-colour legend: lilac = cancerous, orange = healthy.
    plt.gcf().text(0.44,0.75, "Cancerous",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[0]}, alpha = 1)
    plt.gcf().text(0.565,0.75, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
    plt.gcf().text(0.575,0.75, "Healthy",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]}, alpha = 1)
    ## legend
    if sign == 'on':
        plt.gcf().text(0.75,-0.025,'© Made by Milon',{'fontfamily':'serif', 'size':12, 'weight':'bold'},alpha = 1)
    plt.gca().margins(x =0)
    plt.gcf().show();
Finally helper function to visualize bivariate features....
# One pairplot per feature group.
# NOTE(review): `se_character` is plotted twice while `mean_measure` and
# `worst_measure` are never plotted -- this looks like a copy-paste slip;
# confirm which groups were actually intended.
cust_pairplot(df, mean_character, 'Mean Characteristics of Cancer Cells', sign = 'on')
cust_pairplot(df, se_measure, 'SE in Measurements of Cancer Cells',sign = 'on')
cust_pairplot(df, se_character, 'SE in Characteristics of Cancer Cells',sign = 'on')
cust_pairplot(df, se_character, 'SE in Characteristics of Cancer Cells',sign = 'on')
cust_pairplot(df, worst_character, 'Worst of Characteristics of Cancer Cells', sign = 'on')
print(color_class.BOLD_COLOR + '\nGetting High Positively Correatated and Negatively Coorealated Features \nfrom cross categorical feature extraction..\n' + color_class.END)
# Long-form correlation table: one row per (feature, feature) pair with the
# correlation value in column 0 and the names in 'level_0'/'level_1'.
temp_df = df.corr().unstack().reset_index()

def _cross_category_pairs(corr_long, keep):
    """Return unique cross-category feature pairs whose correlation passes `keep`.

    corr_long : long-form correlation frame ('level_0', 'level_1', 0).
    keep : predicate applied to the correlation column (e.g. lambda c: c > 0.9).

    Pairs with identical names, or whose name suffixes (mean/se/worst) match,
    are dropped; (a, b)/(b, a) duplicates are collapsed to a single row.
    """
    suffix0 = corr_long['level_0'].apply(lambda x: str(x).split('_')[-1])
    suffix1 = corr_long['level_1'].apply(lambda x: str(x).split('_')[-1])
    # .copy() so the in-place mutations below act on an owned frame rather
    # than a view (avoids pandas SettingWithCopy warnings / silent no-ops).
    pairs = corr_long[keep(corr_long[0]) &
                      (corr_long['level_0'] != corr_long['level_1']) &
                      (suffix0 != suffix1)].copy()
    # Sorted-tuple key makes (a, b) and (b, a) identical for dedup.
    pairs['z'] = pairs.apply(lambda x: tuple(sorted([x['level_0'], x['level_1']])), axis = 1)
    pairs.drop_duplicates(subset = "z", keep = "first", inplace = True)
    pairs.drop(columns = ['z'], inplace = True)
    return pairs

### cross relational positive features (strong positive correlation)
positive_corr_df = _cross_category_pairs(temp_df, lambda c: c > 0.9)
### cross relational negative features (moderate negative correlation)
negative_corr_df = _cross_category_pairs(temp_df, lambda c: c < -0.2)
print(color_class.BOLD +'Done!')
Getting High Positively Correatated and Negatively Coorealated Features from cross categorical feature extraction.. Done!
print(color_class.BOLD_COLOR + '\nHelper function to visualize the cross categorical Feature analysis\n'+ color_class.END)
def plot_cross_scatter(corr_df, data =df,title = None,des = None,nrows = 4, ncols = 3, figsize = (12,24), colors = colors):
    """Grid of scatter plots, one per correlated feature pair in `corr_df`.

    Parameters
    ----------
    corr_df : frame with 'level_0'/'level_1' columns naming the feature pairs.
    data : source DataFrame (default binds the module-level `df` at def time).
    title : caption text (only used by the commented-out header).
    des : description text rendered above the grid.
    nrows, ncols : grid shape; surplus axes beyond the pair count stay empty.
    figsize : figure size. BUG FIX: this argument was previously ignored --
              the figure was hard-coded to (15, 20), so the caller passing
              figsize=(12, 6) had no effect.
    colors : palette; [0] marks cancerous points, [2] healthy ones.
    """
    col1_list = corr_df['level_0'].values.tolist()
    col2_list = corr_df['level_1'].values.tolist()
    ## plotting
    fig,axes = plt.subplots(nrows, ncols, figsize = figsize)
    # The last axis is left empty on purpose -- hide its frame.
    axes.ravel()[-1].axes.get_xaxis().set_visible(False)
    axes.ravel()[-1].axes.get_yaxis().set_visible(False)
    # zip() stops at the shorter sequence, so extra axes are simply skipped.
    for ax,col1,col2 in zip(axes.ravel(), col1_list,col2_list):
        sns.scatterplot(x= data[col1], y = data[col2], ax = ax,size = 100,
                        linewidth= 0.5, edgecolor = 'black',
                        hue = data['diagnosis'], hue_order = [1,0],
                        palette = [colors[0],colors[2]], legend = False )
        ## plot setting
        # Humanise the feature names for the axis labels.
        xlabel = ' '.join([value.capitalize() for value in str(col1).split('_') ])
        ylabel = ' '.join([value.capitalize() for value in str(col2).split('_') ])
        ax.axes.set_xlabel(xlabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
        ax.axes.set_ylabel(ylabel,{'fontfamily':'serif','size':14, 'weight':'bold'}, alpha = 1)
        ax.set_xticklabels('')
        ax.set_yticklabels('')
    ## titles and text
    fig.text(0.05,0.91,'''{}'''.format(des),{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 1)
    fig.text(0.63,0.885, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]}, alpha = 1)
    fig.text(0.735,0.885, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
    fig.text(0.745,0.885, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]}, alpha = 1)
    fig.text(0.73,0.1,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
    fig.show()
    return None
Helper function to visualize the cross categorical Feature analysis
# Render the two cross-category grids: strong positive pairs, then the
# (smaller) set of moderately negative pairs.
des = 'Here we are seeing the cancer cell features which are highly correlated with each other and belong to different category. \nIt seems we have multi-colinear features and they are passing similar information, and This could alter the predictions.'
plot_cross_scatter(positive_corr_df, title = 'CrossCategorical Positively Related Features', des = des)
des = 'Here we are seeing the cancer cell features which are moderately correlated with each other and belong to different category. \nIt seems we have multi-colinear features and they are passing similar information, and This could alter the predictions.'
plot_cross_scatter(negative_corr_df,nrows = 4,ncols = 2, figsize=(12,6)
                   ,title = 'CrossCategorical Negitively Correlated Features', des = des)
# 2-D UMAP embedding of the feature space, coloured by diagnosis.
temp = df.copy()
X_temp = temp.drop(columns = ['id','diagnosis'])
y_temp = temp['diagnosis']
# fitting on umap
# NOTE(review): passing y_temp to fit_transform makes this a *supervised*
# embedding -- the clean class separation is partly driven by the labels.
# Also, this rebinding shadows the imported `umap` module name.
umap = UMAP(random_state=2021)
model_umap = umap.fit_transform(X_temp, y_temp)
fig,ax = plt.subplots(figsize=(7,7),dpi =80)
# plots
# Healthy (diagnosis == 0) in orange, cancerous (== 1) in lilac.
ax.scatter(model_umap[temp['diagnosis'] == 0][:,0], model_umap[temp['diagnosis'] == 0][:,1], c= colors[2], alpha=1,s=50, linewidth = 1, ec = 'black')
ax.scatter(model_umap[temp['diagnosis'] == 1][:,0], model_umap[temp['diagnosis'] == 1][:,1], c= colors[0], alpha=1,s=50, linewidth = 1, ec = 'black')
## titles and text
ax.set_xticklabels('')
ax.set_yticklabels('')
fig.text(0,1.01,' Dimensionality Reduction with UMAP', {'fontfamily':'serif','size':18, 'weight':'bold'}, alpha = 1)
#fig.text(0,0.95,'''Wow! As data is very less clear clustering of cancer cells can
#be seen. There are clearly seperable and hope get good results...''',{'font':'serif','size':13, 'weight':'normal'}, alpha = 0.95)
fig.text(0.68,0.85, "Cancerous",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.85,0.85, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.87,0.85, "Healthy",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.65,0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)
fig.show()
# Banner for the outlier-removal / distribution-stat helper section.
print(color_class.BOLD_COLOR + '\nOutlier Removal,Skewness, Kurtosis helper funtions are here....\n'+ color_class.END)
def outlier_detect(algo,data):
    """Drop rows that `algo` flags as outliers (prediction == -1).

    algo : any fitted-on-call detector exposing fit_predict (IsolationForest,
           DBSCAN, LocalOutlierFactor, ...), where -1 marks an outlier.
    data : DataFrame with an 'id' column (discarded) and a 'diagnosis' target.

    Returns a new DataFrame with 'diagnosis' first, then the feature columns,
    containing only the rows the detector kept.
    """
    kept_cols = data.drop(columns = ['id']).columns
    features = data[kept_cols].drop(columns = 'diagnosis').values
    targets = data['diagnosis'].values
    # Rows with a -1 prediction are outliers; everything else survives.
    keep_mask = algo.fit_predict(features) != -1
    stacked = np.append(targets[keep_mask].reshape(-1, 1),
                        features[keep_mask, :], axis = 1)
    return pd.DataFrame(data = stacked, columns = kept_cols)
def skew_sum(data):
    """Total skewness: the sum of scipy.stats.skew over all columns of `data`."""
    per_column = skew(data)
    return per_column.sum()
def kurtosis_sum(data):
    """Total excess kurtosis: sum of scipy.stats.kurtosis over all columns."""
    per_column = kurtosis(data)
    return per_column.sum()
def shape(data):
    """Convenience wrapper exposing an object's `.shape` attribute as a call."""
    return data.shape
Outlier Removal,Skewness, Kurtosis helper funtions are here....
print(color_class.BOLD_COLOR+'\nOutliers related information with isolation forest, elliptic envelope, localoutlierfactor, dbscan is storing to dataframes....' + color_class.END)
# Candidate outlier detectors; the sklearn ones are configured to flag ~5%
# of rows, DBSCAN flags whatever falls outside its eps-neighbourhoods.
outlier_algos = [IsolationForest(contamination = 0.05),
                 EllipticEnvelope(contamination = 0.05),
                 LocalOutlierFactor(contamination = 0.05),
                 DBSCAN(eps = 70, min_samples = 10)]
# Seed every summary list with the unfiltered data as the baseline row.
base_df = df.drop(columns = ['id'])
df_list = [base_df]
shapes = [base_df.shape[0]]
skews = [skew(base_df)]
kurts = [kurtosis(base_df)]
for algo in outlier_algos:
    corrected_df = outlier_detect(algo, df)
    df_list.append(corrected_df)
    shapes.append(corrected_df.shape[0])
    skews.append(skew(corrected_df))
    kurts.append(kurtosis(corrected_df))
algorithms = ['Original','IsolationForest', 'EllipticEnvelope', 'LocalOutlierFactor', 'DBSCAN']
# One row per algorithm: the cleaned frame itself plus its summary stats.
outliers_info = pd.DataFrame({'algorithms':algorithms,'df_list':df_list,'shapes':shapes, 'skews':skews, 'kurts': kurts})
outliers_info['skews_sum'] = outliers_info['skews'].apply(lambda x: round(x.sum(),2))
outliers_info['kurts_sum'] = outliers_info['kurts'].apply(lambda x: round(x.sum(),2))
# BUG FIX (dead statement removed):
#   outliers_info.sort_values(by='shapes').reset_index(drop=True, inplace=True)
# was a no-op -- sort_values returns a copy and the in-place reset_index only
# mutated that temporary. Downstream cells index rows positionally
# (Original at 0, IsolationForest at 1, ...), so insertion order is kept.

# Score each candidate frame with a quick linear fit on the 0/1 diagnosis
# target; r2 here is only a rough comparability signal, not a model metric.
from sklearn.metrics import r2_score  # hoisted out of the loop (the loop used to re-import an unused f1_score every pass)
for idx, df_ in enumerate(outliers_info['df_list']):
    X = df_.drop(columns = ['diagnosis'])
    y = df_['diagnosis']
    # NOTE(review): no random_state -- the r2 figures vary run to run.
    xtrain, xtest,ytrain,ytest = train_test_split(X,y,test_size = 0.2)
    # linear regression (the unused `lr = LinearRegression()` local was removed)
    preds = LinearRegression().fit(xtrain.values,ytrain.values).predict(xtest.values)
    outliers_info.loc[idx, 'r2_score'] = round(r2_score(ytest,preds),3)
print(color_class.BOLD + '\nAll the corrected data stored to' + color_class.END \
      + color_class.BOLD_COLOR + str(' Outliers_info DataFrame\n')+ color_class.END)
print(color_class.BOLD)
print(outliers_info.T)
Outliers related information with isolation forest, elliptic envelope, localoutlierfactor, dbscan is storing to dataframes.... All the corrected data stored to Outliers_info DataFrame 0 \ algorithms Original df_list diagnosis radius_mean texture_mean per... shapes 569 skews [0.5270671676029054, 0.9398934455576345, 0.648... kurts [-1.722200200835051, 0.8275836739140465, 0.741... skews_sum 52.6 kurts_sum 230.35 r2_score 0.761 1 \ algorithms IsolationForest df_list diagnosis radius_mean texture_mean per... shapes 540 skews [0.6375196739103915, 0.6679465471669559, 0.708... kurts [-1.593568665377198, -0.08226351117536046, 0.9... skews_sum 34.3 kurts_sum 61.37 r2_score 0.749 2 \ algorithms EllipticEnvelope df_list diagnosis radius_mean texture_mean per... shapes 540 skews [0.6461002842069277, 0.7240168357232661, 0.672... kurts [-1.5825544227477208, 0.12872870717619955, 0.8... skews_sum 37.5 kurts_sum 75.98 r2_score 0.72 3 \ algorithms LocalOutlierFactor df_list diagnosis radius_mean texture_mean per... shapes 540 skews [0.6461002842069274, 0.6493382869646654, 0.674... kurts [-1.5825544227477213, -0.23318349604508004, 0.... skews_sum 43.57 kurts_sum 143.32 r2_score 0.773 4 algorithms DBSCAN df_list diagnosis radius_mean texture_mean per... shapes 443 skews [1.492963669774385, 0.4507097379177925, 0.7420... kurts [0.22894051926616887, 0.3695249571409436, 0.62... skews_sum 48.79 kurts_sum 176.44 r2_score 0.618
print(color_class.BOLD_COLOR +'\nHelper class to make the outlier and original data comparisions....\n'+color_class.END)
class outlier_viz():
    # Overlays a 2-D PCA projection of the original features (colors[0])
    # beneath the projection of the outlier-corrected features (colors[1]).
    def __init__(self,ax,orig_feat = None,corrected_feat = None):
        # ax: matplotlib axis to draw on.
        # orig_feat / corrected_feat: DataFrames before / after outlier removal.
        self.x_org = orig_feat
        self.x_corr = corrected_feat
        self.ax = ax
    def visualize_data(self,name =None, r2 = None,orig_r2 = None):
        # Scatter both PCA projections plus the algorithm name and r2 labels.
        # NOTE(review): the two PCA fits are independent, so the point clouds
        # live in different principal axes -- the overlay is visual only.
        self.ax.set_facecolor('white')
        # dimension reduction
        pca1 = PCA(n_components= 2).fit_transform(self.x_org)
        pca2 = PCA(n_components= 2).fit_transform(self.x_corr)
        self.ax.scatter(pca1[:,0], pca1[:,1], c = colors[0], s = 50, zorder =0, alpha = 1, linewidth = 1, ec = 'black')
        self.ax.scatter(pca2[:,0], pca2[:,1], c = colors[1], s = 50, zorder = 3, alpha = 1,linewidth = 1, ec = 'black')
        # Text positions are in data coordinates; they assume the PCA scores
        # reach into the low thousands on the first component.
        self.ax.text(3000,900,'{}'.format(name), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)
        self.ax.text(3000,800,'R2 Score: {}'.format(r2), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)
        self.ax.text(3000,700,'Orig R2 Score: {}'.format(orig_r2), {'fontfamily':'serif','size':14,'weight':'bold','color':'black'},alpha= 0.9)
Helper class to make the outlier and original data comparisions....
# 2x2 grid: one PCA overlay panel per outlier-detection algorithm.
fig, ax =plt.subplots(2,2,figsize =(13,9), dpi = 70)
axes = ax.ravel()
# Strip tick labels on every panel.
for ax in axes:
    ax.set_xticklabels('')
    ax.set_yticklabels('')
# plotting
# Row 0 of outliers_info is the uncleaned baseline; rows 1-4 are the
# algorithm-specific cleaned frames (positional order is relied upon here).
orig = outliers_info['df_list'][0]
(outlier_viz(ax = axes[0] , orig_feat = orig, corrected_feat= outliers_info['df_list'][1])
 .visualize_data(name = 'Isolation Forest', r2= outliers_info['r2_score'][1],orig_r2 = outliers_info['r2_score'][0]))
(outlier_viz(ax = axes[1], orig_feat = orig, corrected_feat= outliers_info['df_list'][2])
 .visualize_data(name = 'Eclliptic Envelope',r2= outliers_info['r2_score'][2],orig_r2 = outliers_info['r2_score'][0]))
(outlier_viz(ax = axes[2], orig_feat = orig, corrected_feat= outliers_info['df_list'][3])
 .visualize_data(name = 'Local Outlier Factor',r2= outliers_info['r2_score'][3],orig_r2 = outliers_info['r2_score'][0]))
(outlier_viz(ax = axes[3], orig_feat = orig, corrected_feat= outliers_info['df_list'][4])
 .visualize_data(name = 'DBSCAN',r2= outliers_info['r2_score'][4],orig_r2 = outliers_info['r2_score'][0]))
# text and labels
### title and annotations
## titles and text
#fig.text(-0.05,1.085,'Women and Cancer: Outliers and Original Data', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(-0.05,1.0,'''Looks like evey outlier detection algorithm did a good job, butit is not possible to
#select the best one out of them, without looking at skew and kurtosis values,
#lets dive into that next...''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 1)
#0.59
#0.73
#0.74
fig.text(0.29,1, "Original Data",{'fontfamily':'serif','size':22, 'weight':'bold', 'color':colors[0]}, alpha = 1)
fig.text(0.49,1, '|',{'fontfamily':'serif','size':22, 'weight':'bold'})
fig.text(0.54,1, "Corrected Data",{'fontfamily':'serif','size':22, 'weight':'bold','color':colors[1]}, alpha = 1)
fig.text(0.7,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)
fig.tight_layout(pad = 1.5, w_pad = 1.5,h_pad = 1.5)
fig.show()
# Mirrored bar chart: total skewness (left, x-axis inverted) vs total
# kurtosis (right), original value behind each algorithm's corrected value.
fig,ax = plt.subplots(1,2, figsize =(12,6), dpi = 100)
axes = ax.ravel()
# total skew plot
# NOTE(review): each iteration re-draws the whole [1:] corrected-bar series,
# so these loops paint the same bars repeatedly -- harmless but redundant.
for idx in range(1,outliers_info.shape[0]):
    axes[0].barh(width = outliers_info['skews_sum'][0],
                 y = outliers_info['algorithms'][idx], color = colors[0])
    axes[0].barh(width = outliers_info['skews_sum'][1:],
                 y = outliers_info['algorithms'][1:], color = colors[1])
# total kurtosis plot
for idx in range(1,outliers_info.shape[0]):
    axes[1].barh(width = outliers_info['kurts_sum'][0],
                 y = outliers_info['algorithms'][idx], color = colors[0])
    axes[1].barh(width = outliers_info['kurts_sum'][1:],
                 y = outliers_info['algorithms'][1:], color = colors[2])
# plot ticks and title setting
# Algorithm names are padded out so they sit centred between the two panels.
axes[1].tick_params(axis = 'y',pad = 95)
axes[1].set_yticklabels(outliers_info['algorithms'][1:], {'fontfamily':'serif','size':16,'weight':'bold'}, ha = 'center')
axes[1].set_xticklabels('')
axes[0].set_xticklabels('')
axes[0].set_yticklabels('')
axes[0].invert_xaxis()
## text and decorations
# skewness annotations
for pa in axes[0].patches:
    axes[0].text(pa.get_width(), pa.get_y()+ pa.get_height()/2, int(pa.get_width()),
                 {'color':'black','fontfamily':'serif','weight':'bold','size':'12'},alpha= 1, va = 'center')
    # patches[4:] are treated as the corrected-data bars; annotate the
    # percentage reduction relative to the original total.
    if pa in axes[0].patches[4:]:
        change = int((int(outliers_info['skews_sum'][0]) - int(pa.get_width())) / int(outliers_info['skews_sum'][0]) *100)
        axes[0].text(pa.get_width()*0, pa.get_y(),'{}% {}'.format(change,u'\u2193'),
                     {'color':'black','fontfamily':'serif','weight':'bold','size':14},
                     alpha= 0.8,va = 'bottom', ha='right')
# kurtosis annotations
for pa in axes[1].patches:
    axes[1].text(pa.get_width()-20, pa.get_y()+ pa.get_height()/2, int(pa.get_width()),
                 {'color':'black','fontfamily':'serif','weight':'bold','size':'12'},alpha= 1, va = 'center')
    if pa in axes[1].patches[4:]:
        change = int((int(outliers_info['kurts_sum'][0]) - int(pa.get_width())) / int(outliers_info['kurts_sum'][0]) *100)
        axes[1].text(pa.get_width()*0, pa.get_y(),'{}% {}'.format(change,u'\u2193'),
                     {'color':'black','fontfamily':'serif','weight':'bold','size':14},alpha= 0.8, va = 'bottom')
### title and annotations
## titles and text
#fig.text(-0.05,1.18,' Comparision of Total Skews and Kurtosis', {'font':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(-0.05,1.07,'''Total Skews and Total kurtosis means sum of skews,and sum of kurtosis of all features
#respectively. It seems with default setting of 5% points as outliers, Isolation forest
#did well in reducing both skew and kurtosis of data.''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 0.95)
fig.text(0.27,0.99, "Skewness",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[1]})
fig.text(0.40,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.45,0.99, "Original",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[0]})
fig.text(0.60,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.62,0.99, "Kurtosis",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]})
fig.text(0.82,0.0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
plt.tight_layout(pad = 1, h_pad = 1, w_pad = 1)
fig.show()
# Feature-level butterfly plot: per-feature skew (left) and kurtosis (right),
# original values (row 0) behind the IsolationForest-corrected values (row 1).
fig, ax = plt.subplots(1,2,figsize = (12,12))
axes = ax.ravel()
axes[0].invert_xaxis()
axes[0].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['skews'][0].tolist(), color = colors[0],align='center')
axes[0].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['skews'][1].tolist(), color = colors[1],align='center')
axes[1].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['kurts'][0].tolist(), color = colors[0],align='center')
axes[1].barh(y = df.drop(columns = ['id']).columns, width=outliers_info['kurts'][1].tolist(), color = colors[2],align='center')
# Feature names sit centred between the two mirrored panels.
axes[0].set_yticklabels('')
axes[1].set_yticklabels(df.drop(columns = ['id']).columns, {'fontfamily':'serif','size':12,'weight':'bold'},rotation = 0,ha= 'center')
axes[1].tick_params(axis = 'y',pad = 75)
axes[0].set_xticklabels('')
axes[1].set_xticklabels('')
### title and annaotations
### title and annotations
## titles and text
#fig.text(0,1.09,' Isolation Forest Feature Level Stats ', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.05,'''We go to know that Isolation forest does a good job with outlier detection with 10% contamination
#and this butterfly plot shows the feature level change in skewness and kurosis values.''',{'font':'serif','size':14, 'weight':'normal'}, alpha = 0.9)
fig.text(0.27,0.99, "Skewness",{'fontfamily':'serif','size':18, 'weight':'bold', 'color':colors[1]})
fig.text(0.40,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.45,0.99, "Original",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[0]})
fig.text(0.60,0.99, '|',{'fontfamily':'serif','size':18, 'weight':'bold'})
fig.text(0.62,0.99, "Kurtosis",{'fontfamily':'serif','size':18, 'weight':'bold','color':colors[2]})
fig.text(0.75,0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.85)
plt.tight_layout(pad = 1, h_pad = 1, w_pad = 1)
fig.show()
# Adopt the Isolation-Forest-cleaned dataframe (10% contamination) as the
# working dataset and report the before/after shapes.
print(color_class.BOLD_COLOR + '\nSelecting Data without Outliers with Isolation Forest with contamination of 10%\n' + color_class.END)
# selecting df with outliers removed
df = outliers_info['df_list'][1]
# Fix: the BOLD escape was opened but never reset, leaving all subsequent
# terminal output bold.
print(color_class.BOLD + 'Done!\n' + color_class.END)
print('Shape of Original Data: '+\
color_class.BOLD_COLOR+ str(outliers_info['df_list'][0].shape) + color_class.END)
print(color_class.BOLD + 'Shape of Corrected Data: ' + color_class.END+\
color_class.BOLD_COLOR+ str(df.shape) + color_class.END)
print(color_class.BOLD_COLOR+ '\nAll set for feature selection...\n' + color_class.END)
Selecting Data without Outliers with Isolation Forest with contamination of 10% Done! Shape of Original Data: (569, 31) Shape of Corrected Data: (540, 31) All set for feature selection...
print(color_class.BOLD_COLOR+'\nCutom Correlation matrix values extraction...\n'+color_class.END)
## correlation matrix customization
# Reshape the square correlation matrix into long form (one row per feature
# pair) for the custom dot-heatmap below.
corr_df = df.corr()
temp_df = corr_df.stack().reset_index()
# Drop self-correlations (diagonal entries, always exactly 1.0).
temp_df = temp_df[temp_df[0] != 1.0].reset_index(drop = True)
# The matrix is symmetric: key each pair by its sorted name tuple so (a,b)
# and (b,a) dedupe to a single row.
temp_df['z'] = temp_df.apply(lambda x: tuple(sorted([x['level_0'],x['level_1']])), axis = 1)
temp_df.drop_duplicates(subset="z", keep="first" , inplace = True )
temp_df.drop(columns = ['z'], inplace = True)
temp_df.reset_index(drop = True,inplace = True)
# Colour-band by correlation strength: <0.25 -> colors[1] (least),
# 0.25..0.85 -> colors[2] (moderate), otherwise colors[0] (high).
# NOTE(review): a value of exactly 0.25 falls through to the "high" branch —
# looks unintended; confirm the band boundaries.
temp_df['color'] = temp_df[0].apply(lambda x: colors[1] if x <0.25 else ( colors[2] if ((x > 0.25) & (x<0.85)) else colors[0]))
print(color_class.BOLD +'Correation Matrix data ready for custom visualization...\n')
print(color_class.BOLD)
print(temp_df.head(2))
Cutom Correlation matrix values extraction... Correation Matrix data ready for custom visualization... level_0 level_1 0 color 0 diagnosis radius_mean 0.735593 #eda760 1 diagnosis texture_mean 0.394614 #eda760
## custom heatmap for correlation matrix
# Dot-style heatmap: one marker per feature pair, marker area scaled by the
# correlation value and coloured by the strength band computed above.
fig, ax = plt.subplots(figsize = (10,6), dpi = 85)
# flipping yaxis
ax.invert_yaxis()
## Creating dop representational plot
# Marker size is correlation * 100; negative correlations get non-positive
# sizes and therefore render as blanks (as the figure caption explains).
ax.scatter(x = temp_df['level_0'], y = temp_df['level_1'],
s = temp_df[0]*100, c = temp_df['color'], linewidth = 1, edgecolor = 'black')
## plot setting - ticks and labels
x_vals = temp_df['level_0'].value_counts()
y_vals = temp_df['level_1'].value_counts().sort_values()
# Prettify tick labels: "radius_mean" -> "Radius mean".
xticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in x_vals.index]
yticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in y_vals.index]
#xticklabels.reverse()
#for x,y,label in zip(x_vals.values, x_vals.values,xticklabels):
#    ax.text(y-1,x-1.5,label,{'font':'serif','size':10,'weight':'bold','color':'black'},rotation = 90, ha = 'center',va = 'bottom', alpha = 0.75)
ax.set_yticklabels(yticklabels, {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75)
ax.set_xticklabels(xticklabels, {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75,rotation = 90)
## titles and text
fig.text(-0.05,0.98,' Correlation Matrix and Multi-colinearity', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,0.92,'''Features could be highly correlated, moderately correalated, and least correlated based on color scheme.
Blanks spaces indicate negative correlations. Multicolinearity exits in data.''',{'fontfamily':'serif','size':12, 'weight':'normal'}, alpha = 0.9)
# Colour-coded legend assembled from individual fig.text pieces.
fig.text(0.37,0.75, "High",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.45,0.75, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.48,0.75, "Moderate",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.62,0.75, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.65,0.75, "Least",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[1]})
fig.text(0.65,-0.2,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.7)
fig.show()
print(color_class.BOLD_COLOR +'\nVariance inflation factor algorithm in a nutshell...\n'+color_class.END)
def VIF(data):
    """Compute the variance inflation factor (VIF) for every column of *data*.

    Each column is regressed on all remaining columns with ordinary least
    squares (including an intercept, the same fit ``LinearRegression``
    produces), and ``VIF = 1 / (1 - R^2)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature matrix.

    Returns
    -------
    list of float
        One VIF per column, in ``data.columns`` order.  Perfectly collinear
        or constant columns yield ``float('inf')`` instead of raising
        ``ZeroDivisionError`` as the previous implementation did when
        ``R^2 == 1``.
    """
    vif_list = list()
    for col in data.columns:
        X = data.drop(columns = [col]).values.astype(float)
        y = data[col].values.astype(float)
        # OLS with an explicit intercept column; identical predictions to
        # fitting sklearn's LinearRegression and removes the dependency on
        # r2_score inside this helper.
        design = np.column_stack([np.ones(len(y)), X])
        coefs, _, _, _ = np.linalg.lstsq(design, y, rcond=None)
        preds = design @ coefs
        ss_res = float(np.sum((y - preds) ** 2))
        ss_tot = float(np.sum((y - y.mean()) ** 2))
        # A constant column (ss_tot == 0) is fit exactly by the intercept.
        r2 = 1.0 if ss_tot == 0.0 else 1.0 - ss_res / ss_tot
        # Guard the R^2 == 1 case (perfect collinearity) -> infinite VIF.
        vif_list.append(float('inf') if r2 >= 1.0 else 1.0 / (1.0 - r2))
    return vif_list
Variance inflation factor algorithm in a nutshell...
print(color_class.BOLD_COLOR + '\nVariance Inflation factor implementation for feature selection.... \n'+ color_class.END)
# Iterative VIF-based feature elimination: each pass drops the feature with
# the highest VIF and recomputes, until the maximum VIF falls to 10 or below.
VIF_max = 1000
VIF_dfs = {}
n = 0
while int(VIF_max) > 10:
try:
# Normal pass: drop the worst feature from the previous iteration's
# table (temp row 0 — temp is sorted by VIF descending) and recompute.
data = data.drop(columns = [temp['Features'][0]])
temp = (pd.DataFrame({'Features':data.columns,'VIF':VIF(data)})
.sort_values(by = 'VIF', ascending = False).reset_index(drop = True))
# NOTE(review): bare except is used here as a bootstrap — on the first pass
# `temp` is undefined, the NameError lands here, and `data` is initialised
# from df. It also silently swallows any *real* error; consider narrowing
# to `except NameError:`.
except:
data = df.drop(columns = ['diagnosis'])
temp = (pd.DataFrame({'Features':df.drop(columns = ['diagnosis']).columns,'VIF':VIF(data)})
.sort_values(by = 'VIF', ascending = False).reset_index(drop = True))
# Record this iteration's table and the current worst VIF for the loop test.
VIF_max = temp['VIF'][0]
VIF_dfs['iter_{}'.format(n)] = temp
n+=1
del temp
print(color_class.BOLD + '\nCalcuations are finished! feature and corresponding VIF are stored in VIF_dfs list\n'+color_class.END)
print(color_class.BOLD_COLOR + 'Final Features In Data and Final VIFs...'+ color_class.END)
print(color_class.BOLD)
# n was incremented after the last stored iteration, so iter_{n-1} is final.
print(VIF_dfs['iter_{}'.format(n-1)])
Variance Inflation factor implementation for feature selection.... Calcuations are finished! feature and corresponding VIF are stored in VIF_dfs list Final Features In Data and Final VIFs... Features VIF 0 smoothness_worst 9.885053 1 fractal_dimension_worst 9.021375 2 fractal_dimension_mean 7.652832 3 compactness_se 7.396592 4 symmetry_worst 6.632449 5 smoothness_mean 6.414340 6 fractal_dimension_se 6.394734 7 concavity_se 5.143290 8 smoothness_se 4.324478 9 concave points_se 4.114929 10 area_mean 4.072539 11 symmetry_se 3.930165 12 symmetry_mean 3.351743 13 radius_se 2.867922 14 texture_se 2.011996 15 texture_mean 1.688786
print(color_class.BOLD_COLOR + '\nMerging VIF iteration history dataframes to understand how algoirthm worked....\n'+ color_class.END)
### merging all the dataframes from VIF feature selection implementation
# Outer-merge every per-iteration VIF table on 'Features' so each row becomes
# a feature and each column an iteration; features already eliminated in a
# given iteration surface as NaN and are zero-filled below.
for key, value in VIF_dfs.items():
    if key == 'iter_0':
        base = value
    else:
        # (removed a redundant `base = base` self-assignment that followed
        # this merge in the original)
        base = pd.merge(left=base, right=value, on='Features', how='outer')
VIF_matrix = base.fillna(0).set_index('Features', drop=True)
del base
# Relabel the merged columns with their iteration keys (iter_0, iter_1, ...).
VIF_matrix.columns = VIF_dfs.keys()
## write up
print(color_class.BOLD + '\nDataframes merged and stored data into' + color_class.END + color_class.BOLD_COLOR+ ' VIF_matrix' + color_class.END)
print(color_class.BOLD_COLOR+ '\nExtracting data for custom visualization....\n' + color_class.END)
### Extracting data for custom visualization
# Normalize each iteration column to [0, 1] and stack to long form for the
# dot plot (one row per feature/iteration pair).
temp_df = VIF_matrix.apply(lambda x: x/x.max(), axis = 0).stack().reset_index() # noralizing values and stacking
## there wont be any duplicates, just to make sure do this drop
temp_df['z'] = temp_df.apply(lambda x: tuple(sorted([x['Features'],x['level_1']])), axis = 1)
temp_df.drop_duplicates(subset="z", keep="first" , inplace = True )
temp_df.drop(columns = ['z'], inplace = True)
temp_df.reset_index(drop = True,inplace = True)
# The normalized maximum of each iteration is exactly 1 -> highlight colour.
temp_df['color'] = temp_df[0].apply(lambda x: colors[2] if x <1 else colors[0])
## write up
print(color_class.BOLD + '\nDone!!' + color_class.END)
Merging VIF iteration history dataframes to understand how algoirthm worked.... Dataframes merged and stored data into VIF_matrix Extracting data for custom visualization.... Done!!
## custom heatmap for correlation matrix
# Dot plot of normalized VIF per feature (y) per elimination iteration (x):
# marker area tracks the normalized VIF; the iteration maximum (==1) is
# highlighted in colors[0], everything else in colors[2].
fig, ax = plt.subplots(figsize = (10,6), dpi = 85)
# flipping yaxis
ax.invert_yaxis()
## Creating dop representational plot
ax.scatter(y = temp_df['Features'], x = temp_df['level_1'],
s = temp_df[0]*120, c = temp_df['color'], linewidth = 1, edgecolor = 'black')
# Shade the band around the final iteration column (data coordinates).
ax.axvspan(xmin = 13.5, xmax = 14.5, color = colors[1], alpha = 0.25,zorder = 0)
## plot setting - ticks and labels
y_vals = temp_df['Features'].unique()
x_vals = temp_df['level_1'].unique()
# Prettify tick labels: "iter_0" -> "Iter 0", "radius_mean" -> "Radius mean".
xticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in x_vals]
yticklabels = [ ' '.join((str(col).capitalize()).split('_')) for col in y_vals]
#xticklabels.reverse()
#for x,y,label in zip(x_vals.values, x_vals.values,xticklabels):
#    ax.text(y-1,x-1.5,label,{'font':'serif','size':10,'weight':'bold','color':'black'},rotation = 90, ha = 'center',va = 'bottom', alpha = 0.75)
ax.set_yticklabels(yticklabels, {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75)
ax.set_xticklabels(xticklabels, {'fontfamily':'serif','size':10,'weight':'bold','color':'black'}, alpha = 0.75,rotation = 90)
# Callout arrows/labels positioned in data coordinates (annotation_clip=False
# lets them sit outside the axes).
ax.annotate('This Feature drops \nin next iteration', xy=(0.,0), xytext=(2, -1),
arrowprops=dict(facecolor='white',arrowstyle="->",
connectionstyle="arc3,rad=.5",color='black',linewidth=0.8, alpha = 0.7),
#bbox = dict(boxstyle ="round", fc ="white", pad =0.25,color = 'darkorange'),
fontsize=8,fontfamily='serif',fontweight ='bold',ha='center', color='black', zorder = 3,
annotation_clip = False, alpha = 0.85)
ax.annotate('Final Features wrt VIF Feature Removal', xy=(14.75,25), xytext=(14.75,25),
fontsize=8,fontfamily='serif',fontweight ='bold',ha='center', color='black', zorder = 3,
annotation_clip = False, alpha = 0.85, rotation = 90)
## titles and text
fig.text(-0.05,0.98,' Normalized VIF of Features with Iterations', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,0.92,'''VIF are calculated for each feature, and removed the highest VIF feature (Purple) for next iteration.
Clearly all the highly correlated features are removed''',{'fontfamily':'serif','size':12, 'weight':'normal'}, alpha = 0.9)
fig.text(0.5,0.90, "Highest VIF",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.65,0.90, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.68,0.90, "Moderate VIF",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.65,-0.05,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.7)
fig.show()
X_temp = df.drop(columns = ['diagnosis'])
y_temp = df['diagnosis']
temp_X_train,temp_X_val,temp_y_train,temp_y_val = train_test_split(X_temp,y_temp, test_size = 0.2, random_state = 2021)
temp_model= XGBClassifier(eval_metric='logloss').fit(temp_X_train,temp_y_train)
perm = PermutationImportance(temp_model, scoring = 'r2').fit(temp_X_val,temp_y_val)
eli5_feature_importance1 = (pd.DataFrame({'Features':temp_X_train.columns.tolist(),'Importance':perm.feature_importances_})
.sort_values(by = 'Importance'))
perm_imp_feats1 = (eli5_feature_importance1.sort_values(by = 'Importance', ascending = False)
.reset_index(drop = True))['Features'][0:15]
print(color_class.BOLD_COLOR + 'Feature Importance with r2 metric same as VIF....')
eli5.show_weights(perm, feature_names = temp_X_val.columns.tolist())
Feature Importance with r2 metric same as VIF....
| Weight | Feature |
|---|---|
| 0.0930 ± 0.0986 | area_worst |
| 0.0761 ± 0.0986 | texture_worst |
| 0.0507 ± 0.0633 | texture_mean |
| 0.0338 ± 0.1353 | concave points_worst |
| 0.0338 ± 0.0338 | radius_worst |
| 0.0169 ± 0.0414 | area_se |
| 0.0169 ± 0.0676 | fractal_dimension_se |
| 0 ± 0.0000 | area_mean |
| 0 ± 0.0000 | compactness_mean |
| 0 ± 0.0000 | symmetry_mean |
| 0 ± 0.0000 | fractal_dimension_mean |
| 0 ± 0.0000 | smoothness_se |
| 0 ± 0.0000 | radius_se |
| 0 ± 0.0000 | texture_se |
| 0 ± 0.0000 | perimeter_se |
| 0 ± 0.0000 | radius_mean |
| 0 ± 0.0000 | concavity_se |
| 0 ± 0.0000 | symmetry_worst |
| 0 ± 0.0000 | perimeter_mean |
| 0 ± 0.0000 | fractal_dimension_worst |
| … 10 more … | |
print(color_class.BOLD_COLOR + '\nFitting train data on linear regression to get accuracy,r2 ,and roc_aoc scores...\n' + color_class.END)
# Take the surviving features from the FINAL VIF iteration. Generalized from
# the hard-coded 'iter_14' to 'iter_{n-1}' (same key the elimination loop
# itself prints), so this keeps working if the iteration count changes.
vif_features = VIF_dfs['iter_{}'.format(n - 1)]['Features'].values.tolist()
def test_linear_features():
    """Compare three feature sets with a plain LinearRegression.

    Fits on (a) all features, (b) the VIF-selected features and (c) the
    permutation-importance features, using a shared 80/20 split, and scores
    each with accuracy (regression output thresholded at 0.85), r2 and
    roc_auc.

    Relies on module-level ``df``, ``vif_features`` and ``perm_imp_feats1``.

    Returns
    -------
    tuple of three lists, each ``[accuracy, r2, roc_auc]`` for the
    original / VIF / permutation feature sets respectively.
    """
    X_orig = df.drop(columns = ['diagnosis'])
    y_orig = df['diagnosis']
    X_train,X_val,y_train,y_val = train_test_split(X_orig,y_orig, test_size = 0.2, random_state = 2021)
    # linear regression on each feature subset
    orig_preds = LinearRegression().fit(X_train.values,y_train.values).predict(X_val.values)
    vif_preds = LinearRegression().fit(X_train[vif_features].values,y_train.values).predict(X_val[vif_features].values)
    perm_preds = LinearRegression().fit(X_train[perm_imp_feats1].values,y_train.values).predict(X_val[perm_imp_feats1].values)
    orig_r2 = round(r2_score(y_val, orig_preds),3)
    vif_r2 = round(r2_score(y_val, vif_preds),3)
    perm_r2 = round(r2_score(y_val, perm_preds),3)
    # (removed an unused local `from sklearn.metrics import f1_score`)
    # Threshold the continuous regression outputs at 0.85 to get class labels.
    orig_ypred_class = orig_preds > 0.85
    vif_ypred_class = vif_preds > 0.85
    perm_ypred_class = perm_preds > 0.85
    # Accuracy of the thresholded predictions (the originals were
    # misleadingly named *_auc).
    orig_acc = round(accuracy_score(y_val, orig_ypred_class),3)
    vif_acc = round(accuracy_score(y_val, vif_ypred_class),3)
    perm_acc = round(accuracy_score(y_val, perm_ypred_class),3)
    # ROC AUC uses the raw continuous scores, not the thresholded labels.
    orig_roc_auc = round(roc_auc_score(y_val, orig_preds),3)
    vif_roc_auc = round(roc_auc_score(y_val, vif_preds),3)
    perm_roc_auc = round(roc_auc_score(y_val, perm_preds),3)
    orig_list = [orig_acc,orig_r2,orig_roc_auc]
    vif_list = [vif_acc, vif_r2,vif_roc_auc]
    perm_list = [perm_acc,perm_r2,perm_roc_auc]
    return orig_list,vif_list,perm_list
orig_list, vif_list, perm_list = test_linear_features()
print(color_class.BOLD + '\nAccuracy score, r2 score, and roc_auc score:\n ' + color_class.END)
print(color_class.BOLD + 'Orginal Features: ' + color_class.END + color_class.BOLD_COLOR + str(orig_list) + color_class.END + '\n')
# Bug fix: vif_list and perm_list were swapped against their labels below,
# so the permutation and VIF scores were reported under each other's name.
print(color_class.BOLD + 'Permutation Feature Selection: ' + color_class.END + color_class.BOLD_COLOR + str(perm_list) + color_class.END + '\n')
print(color_class.BOLD + 'Variance Infaltion Factor based Feature Selection: ' + color_class.END + color_class.BOLD_COLOR + str(vif_list) + color_class.END + '\n')
Fitting train data on linear regression to get accuracy,r2 ,and roc_aoc scores... Accuracy score, r2 score, and roc_auc score: Orginal Features: [0.852, 0.714, 0.972] Permutation Feature Selection: [0.843, 0.717, 0.987] Variance Infaltion Factor based Feature Selection: [0.806, 0.703, 0.986]
print(color_class.BOLD_COLOR +'\nHelper function for the PCA visualization with diagnosis hue... \n'+ color_class.END)
class pca_viz():
    """Scatter the first two PCA components of a feature matrix onto a
    given matplotlib axis, coloured by the binary diagnosis target."""

    def __init__(self, feat, tar, ax):
        # Feature matrix, target vector, and the axis to draw on.
        self.feat = feat
        self.tar = tar
        self.ax = ax

    def visualize_data(self):
        """Project ``feat`` to 2-D with PCA and draw one scatter per class."""
        labels = pd.DataFrame({'y': self.tar})['y']
        components = PCA(n_components=2).fit_transform(self.feat)
        # Class 0 first in colors[2], then class 1 in colors[0] — matching
        # the Healthy/Cancerous legend colours used by the caller.
        for cls, dot_colour in ((0, colors[2]), (1, colors[0])):
            pts = components[labels == cls]
            self.ax.scatter(pts[:, 0], pts[:, 1], c=dot_colour, s=50,
                            linewidth=1, ec='black')
Helper function for the PCA visualization with diagnosis hue...
# Three-panel PCA comparison figure: permutation-selected features (top-left),
# VIF-selected features (top-right), all original features (bottom-centre),
# each annotated with the LinearRegression scores computed earlier.
fig = plt.figure(figsize =(14,14))
gs = fig.add_gridspec(10,10)
gs.update(wspace = 10,hspace = 2)
#ax0 = fig.add_subplot(gs[:,:])
ax1 = fig.add_subplot(gs[1:5, 0:5])
ax2 = fig.add_subplot(gs[1:5, 5:10])
ax3 = fig.add_subplot(gs[6:10, 2:8])
axes = [ax1,ax2,ax3]
data_ = df.copy()
## data with outlier removal
X_orig = data_.drop(columns = ['diagnosis']).values
X_perm = data_.drop(columns = ['diagnosis'])[perm_imp_feats1].values
y_orig = data_['diagnosis'].values
## data with VIF featue selection
X_vif = data_[vif_features].values
# plots
pca_viz(feat = X_perm,tar = y_orig,ax=axes[0]).visualize_data()
pca_viz(feat = X_vif, tar = y_orig,ax=axes[1]).visualize_data()
pca_viz(feat = X_orig,tar = y_orig,ax=axes[2]).visualize_data()
## titles and text
# Per-panel titles; coordinates are in each panel's data units, hence the
# very different magnitudes across panels.
axes[0].text(-50,570, 'Permutation Feature Selection',{'fontfamily':'serif','size':14, 'weight':'bold'}, zorder =3)
axes[1].text(-100,22.5, 'VIF Feature Selection',{'fontfamily':'serif','size':14, 'weight':'bold'})
axes[2].text(0,675, 'Original Features',{'fontfamily':'serif','size':14, 'weight':'bold'})
## add scores
#permutation annotations
axes[0].annotate('Acc: {}'.format(perm_list[0]),(1600,440),(1600,440), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[0].annotate(' R2: {}'.format(perm_list[1]),(1600,400),(1600,400), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[0].annotate('AUC: {}'.format(perm_list[2]),(1600,350),(1600,350), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
#VIF annotations
axes[1].annotate('Acc: {}'.format(vif_list[0]),(1000,18),(1000,18), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[1].annotate(' R2: {}'.format(vif_list[1]),(1000,16.5),(1000,16.5), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[1].annotate('AUC: {}'.format(vif_list[2]),(1000,15),(1000,15), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
# Original annotations
axes[2].annotate('Acc: {}'.format(orig_list[0]),(1900,510),(1900,510), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[2].annotate(' R2: {}'.format(orig_list[1]),(1900,470),(1900,470), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
axes[2].annotate('AUC: {}'.format(orig_list[2]),(1900,425),(1900,425), zorder =3, annotation_clip = False,
fontsize=12,fontfamily='serif',fontweight ='bold',ha='center', color='black',alpha = 0.85)
# text and titles
fig.text(0.05,0.95,' Linear model Perforamace with PFI,VIF,and Original Data', {'fontfamily':'serif','size':20, 'weight':'bold'}, alpha = 1)
fig.text(0.05,0.9,'''Though the feature selection is done based on r2 metric, for comparision
of accuracy,r2, and auc scores among PFI, VIF and Orginal data with LinearRegression.
VIF based feature selection should give edge here... ''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.9)
# Colour-coded class legend.
fig.text(0.40,0.88, "Cancerous",{'fontfamily':'serif','size':16, 'weight':'bold', 'color':colors[0]})
fig.text(0.52,0.88, '|',{'fontfamily':'serif','size':16, 'weight':'bold'})
fig.text(0.55,0.88, "Healthy",{'fontfamily':'serif','size':16, 'weight':'bold','color':colors[2]})
fig.text(0.75,0.075,'© Made by Milon',{'fontfamily':'serif', 'size':9,'weight':'bold'}, alpha = 0.8)
fig.show()
# Second permutation-importance pass, this time scored with roc_auc.
# Unlike the r2 pass, perm_imp_feats_auc keeps ALL features (no top-15 cut),
# ranked most-important first — downstream code slices prefixes of it.
X_temp = df.drop(columns = ['diagnosis'])
y_temp = df['diagnosis']
temp_X_train,temp_X_val,temp_y_train,temp_y_val = train_test_split(X_temp,y_temp, test_size = 0.2, random_state = 2021)
temp_model= XGBClassifier(eval_metric='logloss').fit(temp_X_train,temp_y_train)
perm = PermutationImportance(temp_model, scoring = 'roc_auc').fit(temp_X_val,temp_y_val)
eli5_feature_importance2 = (pd.DataFrame({'Features':temp_X_train.columns.tolist(),'Importance':perm.feature_importances_})
.sort_values(by = 'Importance'))
perm_imp_feats_auc = (eli5_feature_importance2.sort_values(by = 'Importance', ascending = False)
.reset_index(drop = True))['Features']
print(color_class.BOLD_COLOR + 'Feature Importance with roc_auc metric....')
eli5.show_weights(perm, feature_names = temp_X_val.columns.tolist())
Feature Importance with roc_auc metric....
| Weight | Feature |
|---|---|
| 0.0298 ± 0.0369 | texture_worst |
| 0.0258 ± 0.0135 | area_worst |
| 0.0122 ± 0.0064 | area_se |
| 0.0091 ± 0.0137 | concave points_worst |
| 0.0081 ± 0.0042 | compactness_se |
| 0.0046 ± 0.0054 | concavity_worst |
| 0.0032 ± 0.0057 | symmetry_se |
| 0.0030 ± 0.0046 | perimeter_worst |
| 0.0027 ± 0.0060 | concave points_mean |
| 0.0024 ± 0.0019 | fractal_dimension_se |
| 0.0020 ± 0.0030 | concavity_se |
| 0.0020 ± 0.0107 | radius_worst |
| 0.0016 ± 0.0025 | symmetry_worst |
| 0.0009 ± 0.0012 | texture_mean |
| 0.0007 ± 0.0028 | radius_mean |
| 0.0006 ± 0.0006 | area_mean |
| 0.0005 ± 0.0018 | symmetry_mean |
| 0.0003 ± 0.0003 | perimeter_se |
| 0.0002 ± 0.0026 | smoothness_worst |
| 0.0002 ± 0.0008 | fractal_dimension_worst |
| … 10 more … | |
## dataframe as per feature selection from permutation importance
# For each prefix length idx of the importance-ranked feature list: keep the
# first idx features as-is and collapse all remaining features into a single
# 'all_other' row-sum column, then 5-fold cross-validate XGBoost on it.
temp_X_df = df.drop(columns ='diagnosis').copy()
temp_X_df = temp_X_df[perm_imp_feats_auc]
temp_y_df = df['diagnosis']
## crossvalidation with repeated feature selection
stratified = StratifiedKFold(n_splits = 5,shuffle = True, random_state = 2021)
feat_acc = []
feat_auc = []
feat_f1 = []
for idx,feat in enumerate (perm_imp_feats_auc):
# NOTE(review): this slice-and-assign happens before the idx == 0 `continue`,
# so the first iteration does the work and discards it; also assigning a new
# column on an .iloc slice risks a SettingWithCopyWarning.
temp = temp_X_df.iloc[:,:idx]
temp['all_other'] = temp_X_df.iloc[:,idx:len(perm_imp_feats_auc)].sum(axis = 1)
X_ = temp
y_ = temp_y_df
if idx == 0:
continue
else:
fold_acc = []
fold_auc = []
fold_f1 = []
for train_idx,valid_idx in stratified.split(X_,y_):
xtrain,xvalid = X_.iloc[train_idx],X_.iloc[valid_idx]
ytrain,yvalid = y_.iloc[train_idx],y_.iloc[valid_idx]
model = XGBClassifier(eval_metric = 'logloss').fit(xtrain.values,ytrain.values)
preds = model.predict(xvalid.values)
acc_score= accuracy_score(yvalid,preds)
auc_score = roc_auc_score(yvalid,preds)
f1 = f1_score(yvalid,preds)
fold_acc.append(acc_score)
fold_auc.append(auc_score)
fold_f1.append(f1)
# Per-prefix mean of the 5 fold scores (idx 0 skipped, so list i maps to
# i+1 kept features).
feat_acc.append(round(np.mean(fold_acc),2))
feat_auc.append(round(np.mean(fold_auc),2))
feat_f1.append(round(np.mean(fold_f1),2))
# Line+dot plots of mean CV accuracy (left) and mean CV AUC (right) against
# the number of kept features from the roc_auc permutation ranking.
fig,ax = plt.subplots(1,2,figsize =(12,6))
## accuracy vs number of features from permutation importance based feature selection
ax[0].plot(np.arange(0,len(feat_acc),1),feat_acc, color = colors[0], linewidth = 2)
ax[0].scatter(x =np.arange(0,len(feat_acc),1),y=feat_acc,
color = colors[1], s = 75,zorder = 3,
linewidth = 1,ec = 'black')
ax[0].set_ylabel('Cross-Validation Accuracy Mean',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
ax[0].set_xlabel('Number of Features',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
## area under curve vs number of features from permutation importance based feature selection
ax[1].plot(np.arange(0,len(feat_auc),1),feat_auc, color=colors[0], linewidth = 2)
ax[1].scatter(x =np.arange(0,len(feat_auc),1),y=feat_auc,
color= colors[2], s= 75,zorder =3,
linewidth = 1,ec = 'black')
ax[1].set_ylabel('Cross-Validation AUC Mean',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
ax[1].set_xlabel('Number of Features',{'fontfamily':'serif','size':12, 'weight':'bold'}, alpha = 0.95)
### title and annotations
## titles and text
fig.text(-0.05,1.18,' Influence of Number of Features on Metric', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
fig.text(-0.05,1.07,'''This plot shows clearly that, even single feature from feature selection is giving
0.92 accuracy, and with increase in number of features accuracy and auc increased.
But not after 10 to 15 featrues theres in nothing much of change.''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.95)
# Colour-coded legend assembled from fig.text pieces.
fig.text(0.35,0.99, "Accuracy",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[1]})
fig.text(0.50,0.99, '|',{'fontfamily':'serif','size':14, 'weight':'bold'})
fig.text(0.55,0.99, "Area Under Curve",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.70,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
fig.tight_layout(pad = 2.5, w_pad = 2.5)
fig.show()
# SHAP summary plot for an XGBoost model trained on the top-14 ranked
# features plus the collapsed 'all_other' column.
temp_df = temp_X_df.iloc[:,0:14]
temp_df['all_other'] = temp_X_df.iloc[:,14:len(perm_imp_feats_auc)].sum(axis = 1)
cols = temp_df.columns
# NOTE(review): unlike the other splits, this one has no random_state, so the
# SHAP plot is not reproducible run-to-run — confirm whether that is intended.
temp_xtrain,temp_xtest, temp_ytrain,temp_ytest = train_test_split(temp_df, temp_y_df, test_size = 0.2)
temp_model = XGBClassifier(eval_metric = 'logloss')
temp_model.fit(temp_xtrain,temp_ytrain)
### shapvalues
explainer = shap.TreeExplainer(temp_model)
shap_values = explainer.shap_values(temp_xtest)
# Custom three-stop colormap built from the notebook palette.
cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[1],colors[2],colors[3]])
shap.summary_plot(shap_values,temp_xtest,
show = False,cmap = cmap)
# plot settings
## titles and text
plt.gcf().text(-0.1,1.1,' SHAP Values and Features', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
plt.gcf().text(-0.1,0.98,'''This visualizaiton enables us to understand the feature importance
and global interpretation. ...''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 0.95)
plt.gcf().text(0.65,-0.01,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
plt.gcf().show()
### helper function
def plot_feat(axes_idx = None, data_ = None, scaler_method = None, color = None):
    """Scale *data_* with *scaler_method* and KDE-plot each column.

    Plots go onto the module-level ``axes`` array at the positions given by
    *axes_idx* (one axis index per column, paired by zip). Returns the list
    of per-column skews measured AFTER scaling; columns with skew > 1.5 are
    log1p-compressed for display only (the returned skew is pre-compression).
    """
    col_names = data_.columns
    # Fit+transform once, rebuilding a DataFrame so columns keep their names.
    scaled_df = pd.DataFrame(scaler_method.fit_transform(data_), columns=col_names)
    skew_scaler = []
    for idx, col in zip(axes_idx, col_names):
        col_skew = skew(scaled_df[col])
        # Heavily right-skewed columns get log-compressed purely for plotting.
        series = np.log1p(scaled_df[col] + 0.5) if col_skew > 1.5 else scaled_df[col]
        sns.kdeplot(x=series, ax=axes[idx], color=color, fill=True,
                    alpha=1, linewidth=3, ec='black')
        skew_scaler.append(col_skew)
        # "radius_mean" -> "Radius Mean" for the axis label.
        pretty = ' '.join(part.capitalize() for part in str(col).split('_'))
        axes[idx].axes.get_yaxis().set_visible(False)
        axes[idx].axes.set_xlabel(pretty, {'fontfamily': 'serif', 'size': 10, 'weight': 'bold'}, alpha=1)
    return skew_scaler
### Scalers and axis indices
# Compare three scalers column-by-column: each scaler owns one column of the
# axes grid (indices 0,3,6,... / 1,4,7,... / 2,5,8,...).
scaler_list = [StandardScaler(), MinMaxScaler(), RobustScaler()]
axes_np_list = [np.arange(0,30,3).tolist(), np.arange(1,30,3).tolist(), np.arange(2,30,3).tolist()]
colors_list = [colors[0],colors[1],colors[2]]
data = temp_X_df.iloc[:,0:14]
## plotting
# NOTE(review): 45 axes are created but each index list holds only 10 entries,
# so zip inside plot_feat plots just the first 10 of the 14 selected columns
# and the last 15 axes stay empty — confirm whether 14 columns were intended.
fig,ax = plt.subplots(15,3, figsize = (10,20))
axes = ax.ravel()
scaler_skews = []
for axes_idx_list,scaler, color in zip(axes_np_list,scaler_list, colors_list):
skewness = plot_feat(axes_idx = axes_idx_list, data_ = data, scaler_method = scaler,color=color)
scaler_skews.append(skewness)
plt.tight_layout()
## titles and text
fig.text(0,1.045,' Influence of Scaling on Data', {'fontfamily':'serif','size':22, 'weight':'bold'}, alpha = 1)
#fig.text(0,1.02,'''Three Common approches for Data Scaling are explored here...As all the
#outliers are removed and we couldnt expect much of change... ''',{'fontfamily':'serif','size':14, 'weight':'normal'}, alpha = 1)
# Colour-coded column headers for the three scalers.
fig.text(0.22,1.005, "Standardization\nStandardScaler",{'fontfamily':'serif','size':14, 'weight':'bold', 'color':colors[0]})
fig.text(0.44,1.01, '|',{'fontfamily':'serif','size':27, 'weight':'bold'})
fig.text(0.50,1.005, "Normalization\nMinMaxScaler",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[1]})
fig.text(0.72,1.01, '|',{'fontfamily':'serif','size':27, 'weight':'bold'})
fig.text(0.76,1.005, "OutlierRemoval\nRobustScaler",{'fontfamily':'serif','size':14, 'weight':'bold','color':colors[2]})
fig.text(0.73,0,'© Made by Milon',{'fontfamily':'serif', 'size':10,'weight':'bold'}, alpha = 0.8)
fig.show()
print(color_class.BOLD_COLOR + '\nFinal Data Scaling with StandardScaler.....\n'+color_class.END)
## final data
# Final modelling matrix: the top-15 roc_auc permutation-importance features.
# Note: despite the message above, StandardScaler is applied later, per CV
# fold (fit on the training fold only), not here.
xdata= df.drop(columns = ['diagnosis'])
xdata = xdata[perm_imp_feats_auc].iloc[:,0:15]
ydata = df['diagnosis']
## final data shapes
print(color_class.BOLD + '\nShape of features Data: '+color_class.END+\
color_class.BOLD_COLOR+ str(xdata.shape) + color_class.END)
print(color_class.BOLD + 'Shape of target Data: ' + color_class.END+\
color_class.BOLD_COLOR+ str(ydata.shape) + color_class.END)
print(color_class.BOLD_COLOR+ '\nAll set for final modeling...\n' + color_class.END)
Final Data Scaling with StandardScaler..... Shape of features Data: (540, 15) Shape of target Data: (540,) All set for final modeling...
# Candidate classifiers for the model bake-off; Algorithms holds the matching
# display names (same order as classifiers — keep the two lists in sync).
classifiers = []
classifiers.append(LogisticRegression(random_state = 2021))
classifiers.append(SVC(random_state=2021, probability = True))
classifiers.append(KNeighborsClassifier())
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=2021)))
classifiers.append(RandomForestClassifier(random_state=2021))
classifiers.append(GradientBoostingClassifier(random_state=2021))
classifiers.append(ExtraTreesClassifier(random_state= 2021))
classifiers.append(XGBClassifier(random_state = 2021,eval_metric = 'logloss'))
classifiers.append(LGBMClassifier(random_state = 2021))
# Shared CV splitter so every classifier sees identical folds.
stratified = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 20)
Algorithms = ["Logistic","SVC","KNeighbors","AdaBoost",
"RandomForest","GradientBoosting",
"ExtraTrees","XGBoost", "LightGBM"]
## empty lists to stores values and states
# One entry per classifier; each entry is the list of per-fold values.
class_accuracy = []
class_f1 = []
class_auc = []
class_preds = []
class_class_states= []
class_valid_truths = []
class_valid_features = []
class_cm = []
# Cross-validate every classifier on the shared StratifiedKFold splits.
# Each fold gets a freshly fit StandardScaler (fit on the training fold only,
# so validation statistics never leak into scaling).
for classifier, algo in zip(classifiers, Algorithms):
    fold_accuracy = []
    fold_f1 = []
    fold_roc_auc = []
    fold_preds = []
    fold_class_states = []
    fold_valid_truths = []
    fold_valid_features = []
    fold_cm = []
    n = 0
    print(color_class.BOLD + '*'*17+ color_class.END + color_class.BOLD_COLOR + str(algo) + color_class.END + color_class.BOLD + '*'*17 + color_class.END)
    for train_idx, valid_idx in stratified.split(xdata, ydata):
        xtrain, xvalid = xdata.iloc[train_idx], xdata.iloc[valid_idx]
        ytrain, yvalid = ydata.iloc[train_idx], ydata.iloc[valid_idx]
        ## scaling
        ss = StandardScaler()
        xtrain = ss.fit_transform(xtrain)
        xvalid = ss.transform(xvalid)
        # model (note: the same estimator instance is refit on each fold)
        model = classifier
        model.fit(xtrain, ytrain)
        preds = model.predict(xvalid)
        #### fold results, features, preds, states
        accuracy = accuracy_score(yvalid, preds)
        f1 = f1_score(yvalid, preds)
        roc_auc = roc_auc_score(yvalid, preds)
        cm = confusion_matrix(yvalid, preds)
        fold_accuracy.append(accuracy)
        fold_f1.append(f1)
        fold_roc_auc.append(roc_auc)
        fold_preds.append(preds)
        fold_class_states.append(model)
        fold_valid_truths.append(np.array(yvalid).astype(int))
        fold_valid_features.append(xvalid)
        fold_cm.append(cm)
        ## printing results
        print(color_class.BOLD)
        print("fold{}: Accuracy: {}, F1:{}, Roc_Auc: {} ".format(n, round(accuracy,2),round(f1,2),round(roc_auc,2)))
        print(color_class.END)
        n += 1
    #### per-classifier results, features, preds, states
    class_accuracy.append(fold_accuracy)
    class_f1.append(fold_f1)
    # Bug fix: this previously appended `fold_auc` — a stale global left over
    # from the earlier feature-count experiment — instead of this loop's
    # per-fold ROC AUC list.
    class_auc.append(fold_roc_auc)
    class_preds.append(fold_preds)
    class_valid_truths.append(fold_valid_truths)
    class_valid_features.append(fold_valid_features)
    class_cm.append(fold_cm)
    class_class_states.append(fold_class_states)
    ## brief result dynamic prints
    print( color_class.BOLD+ '\n'+'*'*10 +'Means'+ '*'*10+'\n' + color_class.END)
    print(color_class.BOLD_COLOR)
    print('Accuracy Mean: {}'.format(round(np.mean(fold_accuracy),2)))
    print('F1 Mean: {}'.format(round(np.mean(fold_f1),2)))
    print('ROC_AUC Mean: {}'.format(round(np.mean(fold_roc_auc),2)))
    print(color_class.END)
    print('\n'+ color_class.BOLD+'*'*30 + color_class.END +'\n')
*****************Logistic***************** fold0: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.95 fold1: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 fold2: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 fold4: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.96 **********Means********** Accuracy Mean: 0.98 F1 Mean: 0.96 ROC_AUC Mean: 0.97 ****************************** *****************SVC***************** fold0: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 fold1: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 fold2: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.99 fold3: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 **********Means********** Accuracy Mean: 0.97 F1 Mean: 0.96 ROC_AUC Mean: 0.97 ****************************** *****************KNeighbors***************** fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 fold1: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.94 fold2: Accuracy: 0.99, F1:0.99, Roc_Auc: 0.99 fold3: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 **********Means********** Accuracy Mean: 0.97 F1 Mean: 0.95 ROC_AUC Mean: 0.96 ****************************** *****************AdaBoost***************** fold0: Accuracy: 0.92, F1:0.88, Roc_Auc: 0.9 fold1: Accuracy: 0.94, F1:0.91, Roc_Auc: 0.93 fold2: Accuracy: 0.92, F1:0.89, Roc_Auc: 0.94 fold3: Accuracy: 0.92, F1:0.88, Roc_Auc: 0.91 fold4: Accuracy: 0.95, F1:0.94, Roc_Auc: 0.96 **********Means********** Accuracy Mean: 0.93 F1 Mean: 0.9 ROC_AUC Mean: 0.93 ****************************** *****************RandomForest***************** fold0: Accuracy: 0.98, F1:0.97, Roc_Auc: 0.97 fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 fold2: Accuracy: 0.95, F1:0.94, Roc_Auc: 0.96 fold3: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.94 fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 **********Means********** Accuracy Mean: 0.96 F1 Mean: 0.94 ROC_AUC Mean: 0.95 ****************************** *****************GradientBoosting***************** fold0: Accuracy: 0.97, F1:0.96, 
Roc_Auc: 0.96 fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 fold2: Accuracy: 0.94, F1:0.93, Roc_Auc: 0.96 fold3: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 fold4: Accuracy: 0.94, F1:0.92, Roc_Auc: 0.94 **********Means********** Accuracy Mean: 0.96 F1 Mean: 0.94 ROC_AUC Mean: 0.95 ****************************** *****************ExtraTrees***************** fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.97 fold1: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.93 fold2: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.98 fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.97 fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 **********Means********** Accuracy Mean: 0.96 F1 Mean: 0.95 ROC_AUC Mean: 0.96 ****************************** *****************XGBoost***************** fold0: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.95 fold1: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 fold2: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 fold3: Accuracy: 0.96, F1:0.94, Roc_Auc: 0.95 fold4: Accuracy: 0.95, F1:0.93, Roc_Auc: 0.95 **********Means********** Accuracy Mean: 0.96 F1 Mean: 0.94 ROC_AUC Mean: 0.95 ****************************** *****************LightGBM***************** fold0: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 fold1: Accuracy: 0.94, F1:0.9, Roc_Auc: 0.91 fold2: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 fold3: Accuracy: 0.97, F1:0.96, Roc_Auc: 0.96 fold4: Accuracy: 0.96, F1:0.95, Roc_Auc: 0.97 **********Means********** Accuracy Mean: 0.96 F1 Mean: 0.94 ROC_AUC Mean: 0.95 ******************************
# Collect the per-fold cross-validation artifacts into one dataframe, sorted
# worst-first by mean F1 (so .head(1) below shows the weakest classifier).
print(color_class.BOLD_COLOR+ 'Storing results into dataframe....\n'+ color_class.END)
# Each class_* list holds one entry per classifier; np.row_stack turns the
# per-fold score lists into a (classifier x fold) matrix so mean(axis=1)
# gives the fold mean. (np.row_stack is a deprecated alias of np.vstack in
# NumPy 2.0 -- consider np.vstack when upgrading.)
results_df = (pd.DataFrame({'Algorithms': Algorithms,
'Mean Accuracy':np.row_stack(class_accuracy).mean(axis = 1),
'Mean F1':np.row_stack(class_f1).mean(axis = 1),
# BUG(review): this reuses class_accuracy, so 'Mean Roc_Auc' duplicates
# 'Mean Accuracy' (visible in the printed output below: both 0.927778).
# Presumably the per-fold ROC-AUC list (class_roc_auc?) defined in the
# CV loop above was intended here -- confirm the variable name and fix.
'Mean Roc_Auc':np.row_stack(class_accuracy).mean(axis = 1),
'Classifier Preds':class_preds,
'Classifier Valid Truths':class_valid_truths,
'Classifier Valid Features':class_valid_features,
'Classifier CM':class_cm,
'Classifier States':class_class_states})
.sort_values(by = 'Mean F1',ascending =True)
.reset_index(drop = True))
# Flatten the per-fold prediction/truth lists into single 1-D arrays per
# classifier, for the pooled confusion matrices plotted later.
results_df['Preds_array'] = results_df['Classifier Preds'].apply(lambda x: np.array(x).ravel())
results_df['Truths_array'] = results_df['Classifier Valid Truths'].apply(lambda x: np.array(x).ravel())
print(color_class.BOLD)
# Transposed single-row view: the lowest-F1 classifier and all its artifacts.
print(results_df.head(1).T)
Storing results into dataframe.... 0 Algorithms AdaBoost Mean Accuracy 0.927778 Mean F1 0.898997 Mean Roc_Auc 0.927778 Classifier Preds [[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0,... Classifier Valid Truths [[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,... Classifier Valid Features [[[0.345677598557644, 0.10977870176279673, 0.0... Classifier CM [[[67, 3], [6, 32]], [[66, 4], [3, 35]], [[61,... Classifier States [(DecisionTreeClassifier(random_state=96986675... Preds_array [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ... Truths_array [1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, ...
## Side-by-side bar charts of the cross-validation fold means per classifier.
## Depends on notebook globals: `results_df` (built above) and the `colors`
## palette. Typo fixes in the caption string ("Logitsticregression", grammar).
fig, ax = plt.subplots(1, 2, figsize=(18, 10))
axes = ax.ravel()
# Mirror the left panel so both bar sets grow toward the shared centre labels.
axes[0].invert_xaxis()
## left panel: mean fold accuracy (uniform colour per panel)
color_list = results_df['Mean Accuracy'].apply(lambda x: colors[2])
axes[0].barh(y=results_df['Algorithms'], width=round(results_df['Mean Accuracy'], 3), height=0.5, color=color_list)
for pa in axes[0].patches:
    # numeric value label at each bar tip
    axes[0].text(pa.get_width(), pa.get_y() + pa.get_height()/2, pa.get_width(), ha='right', va='center',
                 **{'fontfamily': 'serif', 'size': 10, 'weight': 'bold'})
## right panel: mean fold F1
color_list1 = results_df['Mean F1'].apply(lambda x: colors[0])
axes[1].barh(y=results_df['Algorithms'], width=round(results_df['Mean F1'], 3), height=0.5, color=color_list1)
for pa in axes[1].patches:
    axes[1].text(pa.get_width(), pa.get_y() + pa.get_height()/2, pa.get_width(), ha='left', va='center',
                 **{'fontfamily': 'serif', 'size': 10, 'weight': 'bold'})
## ticks: algorithm names sit between the two panels (pad pushes them left)
axes[0].set_yticklabels('')
axes[1].set_yticklabels(results_df['Algorithms'], {'fontfamily': 'serif', 'size': 12, 'weight': 'bold'}, rotation=0, ha='center')
axes[1].tick_params(axis='y', pad=75)
axes[0].set_xticklabels('')
axes[1].set_xticklabels('')
## titles and text
fig.text(0, 0.945, ' Crossvalidation Fold Means and Classifiers', {'fontfamily': 'serif', 'size': 22, 'weight': 'bold'}, alpha=1)
fig.text(0, 0.89, '''It seems both LogisticRegression and SVC classifiers are doing the best job. Even the F1 score is
good for the given models. AdaBoost and DecisionTree have somewhat overfitted the data.''', {'fontfamily': 'serif', 'size': 14, 'weight': 'normal'}, alpha=0.9)
fig.text(0.15, 0.825, "Fold Accuracy Mean", {'fontfamily': 'serif', 'size': 18, 'weight': 'bold', 'color': 'black'})
fig.text(0.5, 0.825, '|', {'fontfamily': 'serif', 'size': 24, 'weight': 'bold'})
fig.text(0.65, 0.825, "Fold F1 Score Mean", {'fontfamily': 'serif', 'size': 18, 'weight': 'bold', 'color': 'black'})
fig.text(0.72, 0.15, '© Made by Milon', {'fontfamily': 'serif', 'size': 10, 'weight': 'bold'}, alpha=0.85)
fig.tight_layout(pad=10, w_pad=1, h_pad=10)
fig.show()
## 3x3 grid of pooled confusion-matrix heatmaps, one per CV classifier.
## Depends on notebook globals: `results_df`, `colors`, and the sklearn
## metric functions imported above (confusion_matrix, f1_score,
## roc_auc_score). Caption typos fixed ("Crossvalidataion", grammar).
my_cmap = mpl.colors.LinearSegmentedColormap.from_list("", [colors[0], colors[1], colors[2]])
fig, ax = plt.subplots(3, 3, figsize=(10, 10))
for truth, pred, axes, algo in zip(results_df['Truths_array'],
                                   results_df['Preds_array'],
                                   ax.ravel(), results_df['Algorithms']):
    cf_mat = confusion_matrix(truth, pred)
    #### annotations: cell name + raw count + share of all samples
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
    percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
    #### final annotations
    label = (np.array([f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(labels, counts, percentages)])).reshape(2, 2)
    #### scores over the pooled fold predictions
    f1 = f1_score(truth, pred)
    auc = roc_auc_score(truth, pred)
    # heatmap
    sns.heatmap(data=cf_mat, vmin=0, vmax=330, cmap=my_cmap, linewidth=2, linecolor='white', square=True,
                ax=axes, annot=label, fmt='', cbar=False, annot_kws={'fontfamily': 'serif', 'size': 10, 'color': 'black', 'weight': 'bold', 'alpha': 0.8}, alpha=1)
    axes.text(0, -0, '{}'.format(algo), {'fontfamily': 'serif', 'size': 12, 'color': 'black', 'weight': 'bold'})
    # white disc in the centre acts as a backdrop for the score badge
    axes.scatter(1, 1, s=3500, c='white')
    axes.text(0.72, 1.1, ' F1: {}\nAUC: {}'.format(round(f1, 2), round(auc, 2)), {'fontfamily': 'serif', 'size': 10, 'color': 'black', 'weight': 'bold'})
    ## ticks and labels
    axes.set_xticklabels('')
    axes.set_yticklabels('')
## titles and text
fig.text(0, 1.05, ' Crossvalidation Results', {'fontfamily': 'serif', 'size': 22, 'weight': 'bold'}, alpha=1)
fig.text(0, 1, '''This visualization shows the results of the various classifiers and their respective
confusion matrices.''', {'fontfamily': 'serif', 'size': 14, 'weight': 'normal'}, alpha=0.98)
fig.text(0.72, 0., '© Made by Milon', {'fontfamily': 'serif', 'size': 10, 'weight': 'bold'}, alpha=0.85)
fig.tight_layout(pad=2.5, w_pad=2.5, h_pad=2.5)
fig.show()
# Typo fix: "tunning" -> "tuning".
print(color_class.BOLD + 'Hyperparameters tuning grid...' + color_class.END)
## Search spaces for GridSearchCV below, keyed by an (unfitted) estimator
## instance. Keys are the estimators themselves, values the param grids.
classifiers_params = {
    LogisticRegression(): {'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 10, 100, 200, 1000],
                           'penalty': ['l1', 'l2'],
                           # BUGFIX: the default 'lbfgs' solver rejects the
                           # 'l1' penalty, so every l1 candidate previously
                           # failed and scored nan (hidden by the global
                           # warnings filter). liblinear supports both.
                           'solver': ['liblinear']},
    SVC(): {'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 10, 100, 200, 1000]},
    LGBMClassifier(): {
        # class 1 (positive) is upweighted relative to class 0
        'class_weight': [{1: 6, 0: 4}, {1: 7, 0: 3}, {1: 8, 0: 4}],
        'n_estimators': np.arange(100, 3000, 250),
        'num_leaves': np.arange(10, 50, 10),
        'learning_rate': [0.01, 0.05, 0.1, 0.5]},
    RandomForestClassifier(): {
        'class_weight': [{1: 6, 0: 4}],
        'max_depth': [2, 4, 6, 8, 10],
        'max_leaf_nodes': [5, 10, 15],
        'n_estimators': np.arange(100, 2000, 500)},
    AdaBoostClassifier(): {
        # NOTE(review): 'base_estimator' was renamed to 'estimator' in
        # scikit-learn 1.2 -- confirm the installed version still accepts it.
        'base_estimator': [DecisionTreeClassifier()],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': np.arange(100, 1000, 500)},
}
Hyperparameters tunning grid...
## Hold-out split + standardisation, then an exhaustive grid search for each
## estimator in `classifiers_params`. The winning estimator and params of
## every search are accumulated into the notebook-level `best_est`/`best_pms`.
print(color_class.BOLD_COLOR+ 'Gridsearch CV implementation with predefined grid params'+ color_class.END +'\n')
xtrain, xtest, ytrain, ytest = train_test_split(xdata, ydata, random_state=2021, shuffle=True, stratify=ydata)
ss = StandardScaler()
xtrain = ss.fit_transform(xtrain)
xtest = ss.transform(xtest)  # scaler fitted on train only -- no test leakage
best_est = []
best_pms = []
for estimator, grid in classifiers_params.items():
    print(color_class.BOLD_COLOR + '*'*20 + color_class.END +'\n')
    # `stratified` is the StratifiedKFold instance defined earlier in the notebook.
    search = GridSearchCV(estimator=estimator, param_grid=grid, cv=stratified,
                          verbose=2, scoring='roc_auc', n_jobs=-1)
    search.fit(xtrain, ytrain)
    winner = search.best_estimator_
    best_est.append(winner)
    best_pms.append(search.best_params_)
    # Quick hold-out sanity check for this search's winner.
    holdout_preds = winner.predict(xtest)
    print(color_class.BOLD)
    print(winner)
    print('Accuracy: {}'.format(accuracy_score(ytest, holdout_preds)))
    print('Roc_Auc: {}'.format(round(roc_auc_score(ytest, holdout_preds), 8)))
Gridsearch CV implementation with predefined grid params ******************** Fitting 5 folds for each of 20 candidates, totalling 100 fits LogisticRegression(C=1) Accuracy: 0.9629629629629629 Roc_Auc: 0.95176499 ******************** Fitting 5 folds for each of 10 candidates, totalling 50 fits SVC(C=1) Accuracy: 0.9629629629629629 Roc_Auc: 0.95176499 ******************** Fitting 5 folds for each of 576 candidates, totalling 2880 fits LGBMClassifier(class_weight={0: 4, 1: 8}, learning_rate=0.01, n_estimators=2600, num_leaves=20) Accuracy: 0.9629629629629629 Roc_Auc: 0.94680851 ******************** Fitting 5 folds for each of 60 candidates, totalling 300 fits RandomForestClassifier(class_weight={0: 4, 1: 6}, max_depth=8, max_leaf_nodes=15, n_estimators=600) Accuracy: 0.9629629629629629 Roc_Auc: 0.94680851 ******************** Fitting 5 folds for each of 6 candidates, totalling 30 fits AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.05, n_estimators=100) Accuracy: 0.9185185185185185 Roc_Auc: 0.90280464
## Score every tuned estimator on the held-out test split and gather
## accuracy / F1 / ROC-AUC plus confusion matrices into `hyper_results_df`
## (consumed by the heatmap cell below).
print(color_class.BOLD_COLOR+ 'Results appending from gridsearch...'+'\n'+ color_class.END)
acc_list, cm_list, f1_list, auc_list = [], [], [], []
for model in best_est:
    test_preds = model.predict(xtest)
    acc_list.append(accuracy_score(ytest, test_preds))
    f1_list.append(f1_score(ytest, test_preds))
    auc_list.append(roc_auc_score(ytest, test_preds))
    cm_list.append(confusion_matrix(ytest, test_preds))
# Names listed in the same order the estimators appear in classifiers_params.
algo_names = ['LogisticRegression', 'SVC', 'LGBMClassifier', 'RandomForestClassifier', 'AdaBoostClassifier']
hyper_results_df = pd.DataFrame({'Algorithms': algo_names,
                                 'Accuracy': acc_list,
                                 'f1_score': f1_list,
                                 'roc_auc_score': auc_list,
                                 'confusion_matrix': cm_list})
print(color_class.BOLD + '\n')
print(hyper_results_df.head())
Results appending from gridsearch... Algorithms Accuracy f1_score roc_auc_score \ 0 LogisticRegression 0.962963 0.945055 0.951765 1 SVC 0.962963 0.945055 0.951765 2 LGBMClassifier 0.962963 0.943820 0.946809 3 RandomForestClassifier 0.962963 0.943820 0.946809 4 AdaBoostClassifier 0.918519 0.879121 0.902805 confusion_matrix 0 [[87, 1], [4, 43]] 1 [[87, 1], [4, 43]] 2 [[88, 0], [5, 42]] 3 [[88, 0], [5, 42]] 4 [[84, 4], [7, 40]]
## Confusion-matrix heatmaps for the grid-searched classifiers.
my_cmap = mpl.colors.LinearSegmentedColormap.from_list("", [colors[0], colors[1], colors[2]])
# BUGFIX: a 2x2 grid has only 4 axes, so zip() silently dropped the 5th
# classifier (AdaBoost) from the plot. Use a 2x3 grid sized to the dataframe
# and hide any leftover axes.
fig, ax = plt.subplots(2, 3, figsize=(12, 8))
for spare in ax.ravel()[len(hyper_results_df):]:
    spare.set_visible(False)
for algo, f1, auc, cm, axes in zip(hyper_results_df['Algorithms'],
                                   hyper_results_df['f1_score'],
                                   hyper_results_df['roc_auc_score'],
                                   hyper_results_df['confusion_matrix'],
                                   ax.ravel()):
    cf_mat = cm
    #### annotations: cell name + raw count + share of all samples
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
    percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
    #### final annotations
    label = (np.array([f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(labels, counts, percentages)])).reshape(2, 2)
    # heatmap
    sns.heatmap(data=cf_mat, vmin=0, vmax=84, cmap=my_cmap, linewidth=2, linecolor='white', square=True,
                ax=axes, annot=label, fmt='', cbar=False, annot_kws={'fontfamily': 'serif', 'size': 10, 'color': 'black', 'weight': 'bold', 'alpha': 0.8}, alpha=1)
    axes.text(0, -0, '{}'.format(algo), {'fontfamily': 'serif', 'size': 12, 'color': 'black', 'weight': 'bold'})
    # white disc backdrop for the score badge
    axes.scatter(1, 1, s=3500, c='white')
    axes.text(0.72, 1.1, ' F1: {}\nAUC: {}'.format(round(f1, 2), round(auc, 2)), {'fontfamily': 'serif', 'size': 10, 'color': 'black', 'weight': 'bold'})
    ## ticks and labels
    axes.set_xticklabels('')
    axes.set_yticklabels('')
## titles and text (grammar fixes in the caption string)
fig.text(0, 1.05, ' GridSearch Results', {'fontfamily': 'serif', 'size': 22, 'weight': 'bold'}, alpha=1)
fig.text(0, 1, '''This visualization shows the results of the tuned classifiers and their respective
confusion matrices.''', {'fontfamily': 'serif', 'size': 14, 'weight': 'normal'}, alpha=0.98)
fig.text(0.72, 0., '© Made by Milon', {'fontfamily': 'serif', 'size': 10, 'weight': 'bold'}, alpha=0.85)
fig.tight_layout(pad=2.5, w_pad=2.5, h_pad=2.5)
fig.show()
# Stacked ensemble: out-of-fold predictions of the tuned base models become
# the features for a final XGBoost meta-learner.
models = best_est
# NOTE(review): `stacking` is not imported in the visible code -- presumably
# vecstack.stacking; confirm. Based on usage here it returns train meta-
# features (out-of-fold) and test meta-features for the given models, using
# 5 stratified shuffled folds scored by ROC-AUC.
stack_train,stack_test = stacking(models = best_est,
X_train = xtrain,
y_train = ytrain,
X_test = xtest,
regression = False,
metric = 'roc_auc',
n_folds = 5,shuffle = True,
stratified = True)
# Meta-learner fitted on the stacked meta-features.
fin_model = XGBClassifier(eval_metric='logloss')
fin_model.fit(stack_train, ytrain)
fin_preds = fin_model.predict(stack_test)
print(color_class.BOLD_COLOR+'Stacked Classification...'+'\n'+ color_class.END)
print(color_class.BOLD)
# Hold-out metrics of the stacked ensemble (fin_preds is reused by the
# confusion-matrix plot below).
print('accuracy: {}'.format(round(accuracy_score(ytest,fin_preds),3)))
print('roc_auc: {}'.format(round(roc_auc_score(ytest,fin_preds),3)))
print('f1:{}'.format(round(f1_score(ytest,fin_preds),3)))
Stacked Classification... accuracy: 0.963 roc_auc: 0.952 f1:0.945
models
[LogisticRegression(C=1),
SVC(C=1),
LGBMClassifier(class_weight={0: 4, 1: 8}, learning_rate=0.01, n_estimators=2600,
num_leaves=20),
RandomForestClassifier(class_weight={0: 4, 1: 6}, max_depth=8,
max_leaf_nodes=15, n_estimators=600),
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.05,
n_estimators=100)]
## Confusion-matrix heatmap for the stacked classifier's hold-out predictions.
## Reuses `my_cmap` defined in an earlier heatmap cell (the local definition
## below was deliberately commented out). Caption grammar fixed.
# my_cmap = mpl.colors.LinearSegmentedColormap.from_list("",[colors[0],colors[1],colors[2]])
fig, ax = plt.subplots(figsize=(8, 8))
cf_mat = confusion_matrix(ytest, fin_preds)
f1 = f1_score(ytest, fin_preds)
auc = roc_auc_score(ytest, fin_preds)
#### annotations: cell name + raw count + share of all samples
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
counts = ["{0:0.0f}".format(value) for value in cf_mat.flatten()]
percentages = ["{0:.2%}".format(value) for value in cf_mat.flatten()/np.sum(cf_mat)]
#### final annotations
label = (np.array([f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(labels, counts, percentages)])).reshape(2, 2)
# heatmap
sns.heatmap(data=cf_mat, vmin=0, vmax=84, cmap=my_cmap, linewidth=2, linecolor='white', square=True,
            ax=ax, annot=label, fmt='', cbar=False, annot_kws={'fontfamily': 'serif', 'size': 12, 'color': 'black', 'weight': 'bold', 'alpha': 0.8}, alpha=1)
ax.text(0, -0, '{}'.format('Stacked Classification'), {'fontfamily': 'serif', 'size': 12, 'color': 'black', 'weight': 'bold'})
# white disc backdrop for the score badge
ax.scatter(1, 1, s=5000, c='white')
ax.text(0.85, 1.05, ' F1: {}\nAUC: {}'.format(round(f1, 2), round(auc, 2)), {'fontfamily': 'serif', 'size': 12, 'color': 'black', 'weight': 'bold'})
## ticks and labels
ax.set_xticklabels('')
ax.set_yticklabels('')
## titles and text
fig.text(0, 1.05, ' Stacked Classification Results', {'fontfamily': 'serif', 'size': 22, 'weight': 'bold'}, alpha=1)
fig.text(0, 1, '''This visualization shows the results of the stacked classification and its respective
confusion matrix.''', {'fontfamily': 'serif', 'size': 14, 'weight': 'normal'}, alpha=0.98)
fig.text(0.72, 0., '© Made by Milon', {'fontfamily': 'serif', 'size': 12, 'weight': 'bold'}, alpha=0.85)
fig.tight_layout(pad=2.5, w_pad=2.5, h_pad=2.5)
fig.show()